diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index 1890910480..6298467a83 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -17,6 +17,12 @@ jobs: with: submodules: recursive + - name: Get specific submodule hash + id: core-v-submodule-hash + run: | + cd verif/core-v-verif + echo "hash=$(git rev-parse HEAD)" >> $GITHUB_OUTPUT + - name: Cache toolchain id: cache-toolchain uses: actions/cache@v3 @@ -42,8 +48,7 @@ jobs: cache-name: cache-spike with: path: tools/spike/ - key: ${{ runner.os }}-build-${{ env.cache-name }}-${{ hashFiles('verif/regress/install-spike.sh', - 'verif/core-v-verif/') }} + key: ${{ runner.os }}-build-${{ env.cache-name }}-${{ hashFiles('verif/regress/install-spike.sh')}}-${{ steps.core-v-submodule-hash.outputs.hash }} - name: Prepare run: | @@ -71,6 +76,12 @@ jobs: with: submodules: recursive + - name: Get specific submodule hash + id: core-v-submodule-hash + run: | + cd verif/core-v-verif + echo "hash=$(git rev-parse HEAD)" >> $GITHUB_OUTPUT + - name: Cache toolchain id: cache-toolchain uses: actions/cache@v3 @@ -96,8 +107,7 @@ jobs: cache-name: cache-spike with: path: tools/spike/ - key: ${{ runner.os }}-build-${{ env.cache-name }}-${{ hashFiles('verif/regress/install-spike.sh', - 'verif/core-v-verif/') }} + key: ${{ runner.os }}-build-${{ env.cache-name }}-${{ hashFiles('verif/regress/install-spike.sh')}}-${{ steps.core-v-submodule-hash.outputs.hash }} - name: Run Tests run: | @@ -132,6 +142,12 @@ jobs: with: submodules: recursive + - name: Get specific submodule hash + id: core-v-submodule-hash + run: | + cd verif/core-v-verif + echo "hash=$(git rev-parse HEAD)" >> $GITHUB_OUTPUT + - name: Cache toolchain id: cache-toolchain uses: actions/cache@v3 @@ -157,8 +173,7 @@ jobs: cache-name: cache-spike with: path: tools/spike/ - key: ${{ runner.os }}-build-${{ env.cache-name }}-${{ hashFiles('verif/regress/install-spike.sh', - 'verif/core-v-verif/') }} + key: ${{ runner.os }}-build-${{ env.cache-name }}-${{ 
hashFiles('verif/regress/install-spike.sh')}}-${{ steps.core-v-submodule-hash.outputs.hash }} - name: Run Tests run: | diff --git a/.gitlab-ci.yml b/.gitlab-ci.yml index 99fe7be953..379978f706 100644 --- a/.gitlab-ci.yml +++ b/.gitlab-ci.yml @@ -25,6 +25,10 @@ include: - project: '$CI_PROJECT_NAMESPACE/setup-ci' ref: '$SETUP_CI_CVV_BRANCH' file: 'cva6/core-v-verif-cva6.yml' + - local: '.gitlab-ci-custom.yml' + rules: + - exists: + - '.gitlab-ci-custom.yml' workflow: rules: @@ -34,6 +38,9 @@ workflow: - if: $CI_COMMIT_BRANCH == "master" variables: CI_KIND: regress + - if: $CI_PIPELINE_SOURCE == "merge_request_event" + variables: + CI_KIND: regress - if: $CI_COMMIT_BRANCH =~ /.*_PR_.*/ variables: CI_KIND: dev @@ -107,28 +114,32 @@ build_tools: - mkdir -p artifacts/tools/ - mv tools/spike artifacts/tools/ +.copy_spike_artifacts: &copy_spike_artifacts + - mkdir -p tools + - mv artifacts/tools/spike tools + - /sbin/ldconfig -N tools/spike/lib + .fe_smoke_test: stage: light tests rules: *on_dev before_script: - git -C verif/core-v-verif fetch --unshallow - - mkdir -p tools - - mv artifacts/tools/spike tools + - !reference [.copy_spike_artifacts] - rm -rf artifacts/ - mkdir -p artifacts/{reports,logs} - python3 .gitlab-ci/scripts/report_fail.py - echo $SYN_VCS_BASHRC; source $SYN_VCS_BASHRC .simu_after_script: &simu_after_script - - for i in $(find verif/sim/out*/[vq]*_sim -type f \( -name "*.csv" -o -name "*.iss" \)) ; do head -10000 $i > artifacts/logs/$(basename $i).head ; done + - for i in $(find verif/sim/out*/[vq]*_sim -type f \( -name "*.csv" -o -name "*.iss" -o -name "*.yaml" \)) ; do head -10000 $i > artifacts/logs/$(basename $i).head ; done - head -10000 verif/sim/logfile.log > artifacts/logs/logfile.log.head - - python3 .gitlab-ci/scripts/report_simu.py verif/sim/logfile.log + - if [ -n "$SPIKE_TANDEM" ]; then python3 .gitlab-ci/scripts/report_tandem.py verif/sim/out*/"$DV_SIMULATORS"_sim; else python3 .gitlab-ci/scripts/report_simu.py verif/sim/logfile.log; fi 
-smoke: +smoke-tests: extends: - .fe_smoke_test variables: - DASHBOARD_JOB_TITLE: "Smoke test $DV_SIMULATORS" + DASHBOARD_JOB_TITLE: "Smoke test $DV_SIMULATORS $DV_TARGET" DASHBOARD_JOB_DESCRIPTION: "Short tests to challenge most architectures with most testbenchs configurations" DASHBOARD_SORT_INDEX: 0 DASHBOARD_JOB_CATEGORY: "Basic" @@ -136,17 +147,16 @@ smoke: COLLECT_SIMU_LOGS: 1 parallel: matrix: - - DV_SIMULATORS: - - "veri-testharness,spike" - - "vcs-testharness,spike" - - "questa-testharness,spike" - - "vcs-uvm,spike" + - DV_SIMULATORS: ["vcs-testharness", "questa-testharness"] + DV_TARGET: ["cv32a6_imac_sv32", "cv64a6_imafdc_sv39"] + - DV_SIMULATORS: "vcs-uvm" + DV_TARGET: "cv32a65x" script: - - source $QUESTA_BASHRC - - bash verif/regress/smoke-tests.sh + - if [[ $DV_SIMULATORS == *"questa"* ]]; then source $QUESTA_BASHRC; fi + - bash verif/regress/smoke-tests-$DV_TARGET.sh - !reference [.simu_after_script] -gen_smoke: +smoke-gen: extends: - .fe_smoke_test variables: @@ -154,25 +164,32 @@ gen_smoke: DASHBOARD_JOB_DESCRIPTION: "Short generated tests to challenge the CVA6-DV on STEP1 configuration" DASHBOARD_SORT_INDEX: 0 DASHBOARD_JOB_CATEGORY: "Basic" - DV_SIMULATORS: "vcs-uvm,spike" + DV_SIMULATORS: "vcs-uvm" COLLECT_SIMU_LOGS: 1 + SPIKE_TANDEM: 1 script: - bash verif/regress/smoke-gen_tests.sh - !reference [.simu_after_script] -coremark: +smoke-bench: extends: - .fe_smoke_test variables: - DASHBOARD_JOB_TITLE: "CoreMark" + DASHBOARD_JOB_TITLE: "smoke-bench" DASHBOARD_JOB_DESCRIPTION: "Performance indicator" DASHBOARD_SORT_INDEX: 5 DASHBOARD_JOB_CATEGORY: "Performance" + SPIKE_TANDEM: 1 + BENCH: "dhrystone" + parallel: + matrix: + - DV_TARGET: "cv32a60x" + - DV_TARGET: "cv32a65x" script: - - bash verif/regress/coremark.sh --no-print - - python3 .gitlab-ci/scripts/report_benchmark.py --coremark verif/sim/out_*/veri-testharness_sim/core_main.*.log + - bash verif/regress/"$BENCH"_smoke.sh --no-print + - python3 .gitlab-ci/scripts/report_benchmark.py 
--"$BENCH"_"$DV_TARGET" verif/sim/out_*/vcs-uvm_sim/"$BENCH"_main.*.log -hwconfig: +smoke-hwconfig: extends: - .fe_smoke_test variables: @@ -180,8 +197,10 @@ hwconfig: DASHBOARD_JOB_DESCRIPTION: "Short tests to challenge target configurations" DASHBOARD_SORT_INDEX: 1 DASHBOARD_JOB_CATEGORY: "Basic" - DV_SIMULATORS: "veri-testharness,spike" - DV_HWCONFIG_OPTS: "cv32a6_imac_sv32" + DV_SIMULATORS: "vcs-uvm" + SPIKE_TANDEM: 1 + DV_TARGET: "hwconfig" + DV_HWCONFIG_OPTS: "cv32a65x" script: - source verif/regress/hwconfig_tests.sh - python3 .gitlab-ci/scripts/report_pass.py @@ -218,8 +237,29 @@ spyglass: - make -C spyglass design_read - make -C spyglass lint_check - mv spyglass/sg_run_results/cva6_sg_reports/cva6_lint_lint_rtl artifacts/lint_reports + - cp spyglass/reference_summary.rpt artifacts/lint_reports - python3 .gitlab-ci/scripts/report_spyglass_lint.py spyglass/reference_summary.rpt artifacts/lint_reports/cva6_lint_lint_rtl/summary.rpt +cvxif-regression: + extends: + - .synthesis_test + variables: + DASHBOARD_JOB_TITLE: "CVXIF non-regression test $DV_SIMULATORS" + DASHBOARD_JOB_DESCRIPTION: "Short tests to challenge most CoreV-X-Interface in testharness" + DASHBOARD_SORT_INDEX: 5 + DASHBOARD_JOB_CATEGORY: "Basic" + COLLECT_SIMU_LOGS: 1 + SPIKE_TANDEM: 1 + parallel: + matrix: + - DV_SIMULATORS: + - "veri-testharness,spike" + - "vcs-testharness" + script: + - bash verif/regress/cvxif_verif_regression.sh + - if [[ $DV_SIMULATORS == *"spike"* ]]; then unset SPIKE_TANDEM; fi # dirty hack to do trace comparison between tandem execution and spike standalone + - !reference [.simu_after_script] + asic-synthesis: extends: - .synthesis_test @@ -234,7 +274,7 @@ asic-synthesis: - echo $PERIOD - echo $DV_TARGET - source ./verif/sim/setup-env.sh - - git clone ${SYNTH_SCRIPT} ${SYNTH_SCRIPT_PATH} + - git clone ${SYNTH_SCRIPT} ${SYNTH_SCRIPT_PATH} -b ${SYNTH_SCRIPT_BRANCH} - cp -r ${SYNTH_SCRIPT_PATH}/cva6/ ../ - git apply ${SYNTH_SCRIPT_PATH}/patches/*.patch - echo 
$SYN_DCSHELL_BASHRC; source $SYN_DCSHELL_BASHRC @@ -260,10 +300,10 @@ fpga-build: - source $VIVADO_SETUP - source ./verif/sim/setup-env.sh - mkdir -p artifacts/logs - - make fpga target=$TARGET |& tail -20 > artifacts/logs/logfile.log.tail + - make fpga target=$TARGET &> artifacts/logs/logfile.log - mkdir -p artifacts/reports - mv corev_apu/fpga/work-fpga/ariane_xilinx.bit artifacts/ariane_xilinx_$TARGET.bit - - python3 .gitlab-ci/scripts/report_fpga.py corev_apu/fpga/reports/ariane.utilization.rpt artifacts/logs/logfile.log.tail + - python3 .gitlab-ci/scripts/report_fpga.py corev_apu/fpga/reports/ariane.utilization.rpt .regress_test: stage: heavy tests @@ -275,17 +315,32 @@ fpga-build: - when: manual allow_failure: true -dhrystone: +benchmarks: extends: - .regress_test variables: - DASHBOARD_JOB_TITLE: "Dhrystone" + DASHBOARD_JOB_TITLE: "benchmark $BENCH $ISSUE" DASHBOARD_JOB_DESCRIPTION: "Performance indicator" DASHBOARD_SORT_INDEX: 5 DASHBOARD_JOB_CATEGORY: "Performance" + SPIKE_TANDEM: 1 + parallel: + matrix: + - BENCH: "dhrystone" + ISSUE: "single" + DV_HWCONFIG_OPTS: ["cv32a60x IcacheByteSize=16384 IcacheSetAssoc=8 DcacheByteSize=32768 DcacheSetAssoc=8 BHTEntries=128 NrScoreboardEntries=8 DCacheType=config_pkg::WT"] + - BENCH: "dhrystone" + ISSUE: "dual" + DV_HWCONFIG_OPTS: ["cv32a65x IcacheByteSize=16384 IcacheSetAssoc=8 DcacheByteSize=32768 DcacheSetAssoc=8 BHTEntries=128 NrScoreboardEntries=8 DCacheType=config_pkg::WT"] + - BENCH: "coremark" + ISSUE: "single" + DV_HWCONFIG_OPTS: ["cv32a60x IcacheByteSize=16384 IcacheSetAssoc=8 DcacheByteSize=32768 DcacheSetAssoc=8 BHTEntries=128 NrScoreboardEntries=8 DCacheType=config_pkg::WT"] + - BENCH: "coremark" + ISSUE: "dual" + DV_HWCONFIG_OPTS: ["cv32a65x IcacheByteSize=16384 IcacheSetAssoc=8 DcacheByteSize=32768 DcacheSetAssoc=8 BHTEntries=128 NrScoreboardEntries=8 DCacheType=config_pkg::WT"] script: - - bash verif/regress/dhrystone.sh - - python3 .gitlab-ci/scripts/report_benchmark.py --dhrystone 
verif/sim/out_*/veri-testharness_sim/dhrystone_main.*.log + - bash verif/regress/"$BENCH".sh + - python3 .gitlab-ci/scripts/report_benchmark.py --"$BENCH"_"$ISSUE" verif/sim/out_*/vcs-uvm_sim/"$BENCH"_main.*.log riscv_arch_test: extends: @@ -295,8 +350,8 @@ riscv_arch_test: DASHBOARD_JOB_DESCRIPTION: "Compliance regression suite" DASHBOARD_SORT_INDEX: 0 DASHBOARD_JOB_CATEGORY: "Test suites" - DV_SIMULATORS: "veri-testharness,spike" - COLLECT_SIMU_LOGS: 1 + DV_SIMULATORS: "vcs-testharness" + SPIKE_TANDEM: 1 script: source verif/regress/dv-riscv-arch-test.sh after_script: *simu_after_script @@ -308,12 +363,13 @@ compliance: DASHBOARD_JOB_DESCRIPTION: "Compliance regression suite" DASHBOARD_SORT_INDEX: 2 DASHBOARD_JOB_CATEGORY: "Test suites" - DV_SIMULATORS: "veri-testharness,spike" - COLLECT_SIMU_LOGS: 1 + DV_SIMULATORS: "vcs-testharness" + SPIKE_TANDEM: 1 script: source verif/regress/dv-riscv-compliance.sh after_script: *simu_after_script riscv-tests-v: + timeout : 2 hours extends: - .regress_test variables: @@ -321,10 +377,9 @@ riscv-tests-v: DASHBOARD_JOB_DESCRIPTION: "Riscv-test regression suite (virtual)" DASHBOARD_SORT_INDEX: 3 DASHBOARD_JOB_CATEGORY: "Test suites" - DV_SIMULATORS: "veri-testharness,spike" + DV_SIMULATORS: "vcs-testharness,spike" DV_TARGET: cv64a6_imafdc_sv39 DV_TESTLISTS: "../tests/testlist_riscv-tests-$DV_TARGET-v.yaml" - COLLECT_SIMU_LOGS: 1 script: source verif/regress/dv-riscv-tests.sh after_script: *simu_after_script @@ -336,9 +391,9 @@ riscv-tests-p: DASHBOARD_JOB_DESCRIPTION: "Riscv-test regression suite (physical)" DASHBOARD_SORT_INDEX: 4 DASHBOARD_JOB_CATEGORY: "Test suites" - DV_SIMULATORS: "veri-testharness,spike" + DV_SIMULATORS: "vcs-testharness" + SPIKE_TANDEM: 1 DV_TESTLISTS: "../tests/testlist_riscv-tests-$DV_TARGET-p.yaml" - COLLECT_SIMU_LOGS: 1 script: source verif/regress/dv-riscv-tests.sh after_script: *simu_after_script @@ -371,6 +426,8 @@ generated_tests: variables: DASHBOARD_SORT_INDEX: 11 DASHBOARD_JOB_CATEGORY: "Code 
Coverage" + SPIKE_TANDEM: 1 + DV_SIMULATORS: "vcs-uvm" parallel: matrix: - list_num: 1 @@ -398,12 +455,14 @@ generated_tests: - mv verif/sim/seedlist.yaml artifacts/coverage - python3 .gitlab-ci/scripts/report_pass.py -generated_xif_tests: +.generated_xif_tests: extends: - .verif_test variables: DASHBOARD_SORT_INDEX: 12 DASHBOARD_JOB_CATEGORY: "Code Coverage" + SPIKE_TANDEM: 1 + DV_SIMULATORS: "vcs-uvm" parallel: matrix: - list_num: 1 @@ -422,6 +481,8 @@ directed_isacov-tests: variables: DASHBOARD_SORT_INDEX: 13 DASHBOARD_JOB_CATEGORY: "Functional Coverage" + SPIKE_TANDEM: 1 + DV_SIMULATORS: "vcs-uvm" parallel: matrix: - list_num: 0 @@ -442,11 +503,12 @@ csr_embedded_tests: DASHBOARD_SORT_INDEX: 15 DASHBOARD_JOB_CATEGORY: "CSR tests" DV_SIMULATORS: "vcs-uvm" + SPIKE_TANDEM: 1 script: - mkdir -p artifacts/coverage - source verif/regress/dv-csr-embedded-tests.sh - mv verif/sim/vcs_results/default/vcs.d/simv.vdb artifacts/coverage - - python3 .gitlab-ci/scripts/report_pass.py + - python3 .gitlab-ci/scripts/report_tandem.py verif/sim/out*/"$DV_SIMULATORS"_sim .backend_test: stage: backend tests @@ -463,8 +525,7 @@ simu-gate: - asic-synthesis parallel: matrix: - - SIMU_PERIOD: ["20"] # 50 Mhz - PERIOD: ["15"] # 66 Mhz + - PROG_NAME: ["dhrystone_smoke"] variables: DASHBOARD_JOB_TITLE: "Gate Level Simulation $DV_TARGET" DASHBOARD_JOB_DESCRIPTION: "Tests to check netlist from ASIC synthesis and power consumption over different patterns" @@ -472,24 +533,36 @@ simu-gate: DASHBOARD_JOB_CATEGORY: "Post Synthesis" DV_TARGET: cv32a65x TARGET: $DV_TARGET + TOP: "cva6" + SPIKE_TANDEM: 1 + SIMU_PERIOD: "20" # 50 Mhz + PERIOD: "15" # 66 Mhz script: + - mkdir -p artifacts/{reports,logs} - git -C verif/core-v-verif fetch --unshallow - - mkdir -p tools - - mv artifacts/tools/spike tools - - echo $SYN_VCS_BASHRC; source $SYN_VCS_BASHRC + - !reference [.copy_spike_artifacts] - echo $PERIOD - source ./verif/sim/setup-env.sh - - git clone ${SYNTH_SCRIPT} ${SYNTH_SCRIPT_PATH} + - git clone 
${SYNTH_SCRIPT} ${SYNTH_SCRIPT_PATH} -b testelf + - git -C ${SYNTH_SCRIPT_PATH} checkout cb92f846 - cp -r ${SYNTH_SCRIPT_PATH}/cva6/ ../ - git apply ${SYNTH_SCRIPT_PATH}/patches/*.patch - source verif/regress/install-riscv-tests.sh - - mv artifacts/cva6_${DV_TARGET} pd/synth/ - - mv artifacts/cva6_${DV_TARGET}_synth.v pd/synth/ - - mv artifacts/cva6_${DV_TARGET}_synth.sdf pd/synth/ - - mkdir -p pd/synth/cva6_${DV_TARGET}/outputs/ - - python3 ${SYNTH_SCRIPT_PATH}/scharm -p configs/modules/CVA6.yml --runner=True --compaign="simu-gate" --name=$PROG_NAME + - mv artifacts/${TOP}_${DV_TARGET} pd/synth/ + - mv artifacts/${TOP}_${DV_TARGET}_synth.v pd/synth/ + - mv artifacts/${TOP}_${DV_TARGET}_synth.sdf pd/synth/ + - mkdir -p pd/synth/${TOP}_${DV_TARGET}/outputs/ + - export DV_SIMULATORS="spike" + - bash verif/regress/${PROG_NAME}.sh + - cp verif/sim/out_*/directed_tests/*.o verif/sim/testelf.o + - python3 ${SYNTH_SCRIPT_PATH}/scharm -p configs/modules/CVA6.yml --runner=True --compaign="simu-gate" --name=testelf + - grep "Simulation terminated" verif/sim/out_*/*/*.log.iss - mv ${SYNTH_SCRIPT_PATH}/artifacts/ artifacts/artifacts_gate/ - after_script: *simu_after_script + - rm artifacts/artifacts_gate/*/build/*.fsdb + - mkdir -p verif/sim/out_reports + - mkdir -p artifacts/sim_artifacts + - for i in verif/sim/out*/vcs-uvm-gate*/*; do cp $i $(dirname $(dirname $i))/vcs-uvm_sim/gate.$(basename $i); done + - python3 .gitlab-ci/scripts/report_tandem.py verif/sim/out*/vcs-uvm_sim fpga-boot: extends: @@ -520,7 +593,7 @@ code_coverage-report: needs: - generated_tests - directed_isacov-tests - - generated_xif_tests +# - generated_xif_tests - csr_embedded_tests variables: DASHBOARD_JOB_TITLE: "Report merge coverage" diff --git a/.gitlab-ci/expected_synth.yml b/.gitlab-ci/expected_synth.yml index d9fa51864e..98a2aeb83f 100644 --- a/.gitlab-ci/expected_synth.yml +++ b/.gitlab-ci/expected_synth.yml @@ -1,2 +1,2 @@ cv32a65x: - gates: 129171 + gates: 178869 diff --git 
a/.gitlab-ci/scripts/report_benchmark.py b/.gitlab-ci/scripts/report_benchmark.py index c471190956..7f781c8046 100644 --- a/.gitlab-ci/scripts/report_benchmark.py +++ b/.gitlab-ci/scripts/report_benchmark.py @@ -7,7 +7,9 @@ # # Original Author: Côme Allart +import os import sys +import re import report_builder as rb path = None @@ -17,54 +19,57 @@ # Keep it up-to-date with compiler version and core performance improvements # Will fail if the number of cycles is different from this one valid_cycles = { - 'dhrystone': 217900, - 'coremark': 686072, + "dhrystone_dual": 20199, + "dhrystone_single": 25019, + "coremark_dual": 1017451, + "coremark_single": 1308656, + "dhrystone_cv32a65x": 32566, + "dhrystone_cv32a60x": 39994, } for arg in sys.argv[1:]: - if arg == '--dhrystone': - mode = 'dhrystone' - # Standard value for Dhrystone - iterations = 500 - elif arg == '--coremark': - mode = 'coremark' - # Defined in verif/regress/coremark.sh - iterations = 2 + if "--dhrystone" in arg or "--coremark" in arg: + if "--dhrystone" in arg: + iterations = 50 + else: + if "--coremark" in arg: + iterations = 4 + mode = arg.replace("-", "") else: path = arg # We do not want to have a report without a check assert mode is not None -with open(path, 'r') as f: +with open(path, "r") as f: log = [l.strip() for l in f.readlines()] stopwatch = [] for index, line in enumerate(log): - if line.split()[-1] == 'mcycle' or line.split()[-2] == 'mcycle,': + if line.split()[-1] == "mcycle" or line.split()[-2] == "mcycle,": stopwatch.append(int(log[index + 1].split()[-1], 16)) # There might be > 2 matches, we use the two at the center N = len(stopwatch) assert N % 2 == 0 -cycles = stopwatch[N//2] - stopwatch[N//2-1] +cycles = stopwatch[N // 2] - stopwatch[N // 2 - 1] -score_metric = rb.TableMetric('Performance results') -score_metric.add_value('cycles', cycles) +score_metric = rb.TableMetric("Performance results") +score_metric.add_value("cycles", cycles) if iterations is not None: ipmhz = iterations * 
1000000 / cycles - if mode == 'dhrystone': - score_metric.add_value('Dhrystone/MHz', ipmhz) - score_metric.add_value('DMIPS/MHz', ipmhz / 1757) - if mode == 'coremark': - score_metric.add_value('CoreMark/MHz', ipmhz) + if "dhrystone" in mode: + score_metric.add_value("Dhrystone/MHz", ipmhz) + score_metric.add_value("DMIPS/MHz", ipmhz / 1757) + if "coremark" in mode: + score_metric.add_value("CoreMark/MHz", ipmhz) diff = cycles - valid_cycles[mode] if diff != 0: score_metric.fail() - score_metric.add_value('Cycles diff', diff) + score_metric.add_value("Cycles diff", diff) -report = rb.Report(f'{cycles//1000} kCycles') +report = rb.Report(f"{cycles//1000} kCycles") report.add_metric(score_metric) report.dump() diff --git a/.gitlab-ci/scripts/report_builder.py b/.gitlab-ci/scripts/report_builder.py index e932ad3ade..a07c735db0 100644 --- a/.gitlab-ci/scripts/report_builder.py +++ b/.gitlab-ci/scripts/report_builder.py @@ -172,12 +172,19 @@ def to_doc(self): def dump(self, path=None): """ - Create report file + Print results and create report file By default the output path is build from $CI_JOB_NAME """ + for metric in self.metrics: + print(metric.values) + if path is None: - filename = re.sub(r'[^\w\.\\\/]', '_', os.environ["CI_JOB_NAME"]) - path = 'artifacts/reports/'+filename+'.yml' - with open(path, 'w') as f: - yaml.dump(self.to_doc(), f) + ci_job_name = os.environ.get("CI_JOB_NAME") + if ci_job_name is not None: + filename = re.sub(r'[^\w\.\\\/]', '_', ci_job_name) + path = 'artifacts/reports/'+filename+'.yml' + + if path is not None: + with open(path, 'w') as f: + yaml.dump(self.to_doc(), f) diff --git a/.gitlab-ci/scripts/report_fpga.py b/.gitlab-ci/scripts/report_fpga.py index 88a12e9e48..a1f48c7fa6 100644 --- a/.gitlab-ci/scripts/report_fpga.py +++ b/.gitlab-ci/scripts/report_fpga.py @@ -15,9 +15,6 @@ with open(str(sys.argv[1]), "r") as f: log = f.read() -with open(str(sys.argv[2]), "r") as f: - outputlog = f.read() - pattern = re.compile( "\|(?P 
+)(?P[\w()\[\].]+) +\| +(?P[\w()\[\].]+) \| +(?P\d+) \| +(?P\d+) \| +(?P\d+) \| +(?P\d+) \| +(?P\d+) \| +(?P\d+) \| +(?P\d+) \| +(?P\d+) \|" ) @@ -50,8 +47,5 @@ i["DSP48Blocks"] + " DSP48Blocks", ) -log_metric = rb.LogMetric("Last lines of logfile") -log_metric.values = outputlog.splitlines() - -report.add_metric(metric, log_metric) +report.add_metric(metric) report.dump() diff --git a/.gitlab-ci/scripts/report_spyglass_lint.py b/.gitlab-ci/scripts/report_spyglass_lint.py index 2142a58423..c1635f1bed 100644 --- a/.gitlab-ci/scripts/report_spyglass_lint.py +++ b/.gitlab-ci/scripts/report_spyglass_lint.py @@ -11,7 +11,6 @@ import re import sys - import report_builder as rb @@ -69,15 +68,17 @@ def compare_summaries(baseline_info, new_info): message = ( f"Count changed from {baseline_dict[key][0]} to {new_dict[key][0]}" ) - comparison_results.append((*key, *value, "PASS", message)) + if key[0] == "ERROR" and new_dict[key][0] > baseline_dict[key][0]: + comparison_results.append((*key, *value, "FAIL", message)) + else: + comparison_results.append((*key, *value, "PASS", message)) severity_order = {"ERROR": 1, "WARNING": 2, "INFO": 3} comparison_results.sort(key=lambda x: severity_order[x[0]]) - return comparison_results -def report_spyglass_lint(comparison_results): +def generate_spyglass_lint_report(comparison_results): metric = rb.TableStatusMetric("") metric.add_column("SEVERITY", "text") metric.add_column("RULE NAME", "text") @@ -94,7 +95,10 @@ def report_spyglass_lint(comparison_results): report = rb.Report() report.add_metric(metric) - report.dump() + + for value in metric.values: + print(" | ".join(map(str, value))) + return report if __name__ == "__main__": @@ -107,4 +111,8 @@ def report_spyglass_lint(comparison_results): baseline_info = extract_info(summary_ref_results) new_info = extract_info(summary_rpt) comparison_results = compare_summaries(baseline_info, new_info) - report_spyglass_lint(comparison_results) + report = 
generate_spyglass_lint_report(comparison_results) + print(report.failed) + report.dump() + if report.failed: + sys.exit(1) diff --git a/.gitlab-ci/scripts/report_tandem.py b/.gitlab-ci/scripts/report_tandem.py new file mode 100644 index 0000000000..c062a9f420 --- /dev/null +++ b/.gitlab-ci/scripts/report_tandem.py @@ -0,0 +1,104 @@ +# Copyright 2024 Thales DIS France SAS +# +# Licensed under the Solderpad Hardware Licence, Version 0.51 (the "License"); +# you may not use this file except in compliance with the License. +# SPDX-License-Identifier: Apache-2.0 WITH SHL-2.1 +# You may obtain a copy of the License at https://solderpad.org/licenses/ +# +# Original Author: Valentin Thomazic (valentin.thomazic@thalesgroup.com) + +import sys +import report_builder +import os +import glob +import yaml + + +def main(): + with_logs = os.environ.get("COLLECT_SIMU_LOGS") != None + metrics_table = report_builder.TableStatusMetric('') + + check_provided_args() + add_table_legend(metrics_table, with_logs) + passed_tests_count, total_tests_count = fill_table(sys.argv[1], metrics_table, with_logs) + + if not report(metrics_table, passed_tests_count, total_tests_count): + sys.exit(1) + + +def check_provided_args(): + if len(sys.argv) != 2: + sys.exit("Usage : python report_tandem.py path/to/log/dir") + + if not os.path.exists(sys.argv[1]): + sys.exit("No valid log directory provided!") + + if len(list(glob.iglob(sys.argv[1] + "/*.yaml"))) == 0: + sys.exit("No reports in log directory!") + + +def add_table_legend(metrics_table, with_logs): + metrics_table.add_column("TARGET", "text") + metrics_table.add_column("ISA", "text") + metrics_table.add_column("TEST", "text") + metrics_table.add_column("TEST LIST", "text") + metrics_table.add_column("SIMULATOR", "text") + metrics_table.add_column("MISMATCHES", "text") + + if with_logs: + metrics_table.add_column("OUTPUT", "log") + metrics_table.add_column("TB LOGS", "log") + metrics_table.add_column("DISASSEMBLY", "log") + + +def 
fill_table(reports_dir, metrics_table, with_logs): + simulation_reports = glob.iglob(reports_dir + "/*.yaml") + test_passed = 0 + test_count = 0 + + for report in simulation_reports: + test_passed += add_test_row(report, metrics_table, with_logs) + test_count += 1 + if test_passed != test_count: + metrics_table.fail() + return test_passed, test_count + + +def add_test_row(report_file, metrics_table, with_logs): + try: + with open(report_file) as f: + report = yaml.safe_load(f) + mismatches_count = str(report["mismatches_count"]) if "mismatches_count" in report else "Not found" + + row = [report["target"], report["isa"], report["test"], report["testlist"], report["simulator"], mismatches_count] + + if with_logs: + logs_path = "logs/" + os.environ.get("CI_JOB_ID") + "/artifacts/logs/" + output_log = logs_path + "logfile.log.head" + log_prefix = logs_path + report['test'] + "_" + str(report["iteration"]) + "." + report["target"] \ + if "iteration" in report else logs_path + report['test'] + "." + report["target"] + tb_log = log_prefix + '.log.iss.head' + disassembly = log_prefix + '.log.csv.head' + + row.append(output_log) + row.append(tb_log) + row.append(disassembly) + + if report["exit_cause"] == "SUCCESS" and report["exit_code"] == 0: + metrics_table.add_pass(*row) + return 1 + + metrics_table.add_fail(*row) + return 0 + except (TypeError, KeyError): + sys.exit("Invalid yaml file in log directory! 
Is the log directory correct?") + +def report(metrics_table, passed_test_count, total_test_count): + report = report_builder.Report(f'{passed_test_count}/{total_test_count}') + report.add_metric(metrics_table) + report.dump() + return not report.failed + + +if __name__ == "__main__": + main() diff --git a/.readthedocs.yaml b/.readthedocs.yaml index 3c51bfed3c..957bd7983f 100644 --- a/.readthedocs.yaml +++ b/.readthedocs.yaml @@ -8,6 +8,14 @@ build: os: "ubuntu-20.04" tools: python: "3.9" + nodejs: "20" + ruby: "3.3" + jobs: + post-install: + - npm install docs/riscv-isa/riscv-isa-manual/dependencies + - gem install -g docs/riscv-isa/riscv-isa-manual/dependencies/Gemfile + pre-build: + - make -C docs prepare # Build from the docs directory with Sphinx sphinx: diff --git a/Bender.yml b/Bender.yml index 56f3b8135b..5ef388dad7 100644 --- a/Bender.yml +++ b/Bender.yml @@ -76,7 +76,6 @@ sources: - core/include/std_cache_pkg.sv # Extension Interface - - core/include/cvxif_pkg.sv - core/cvxif_example/include/cvxif_instr_pkg.sv - core/cvxif_fu.sv - core/cvxif_example/cvxif_example_coprocessor.sv @@ -110,7 +109,6 @@ sources: - core/ariane_regfile_ff.sv - core/ariane_regfile_fpga.sv - core/scoreboard.sv - - core/round_interval.sv - core/store_buffer.sv - core/amo_buffer.sv - core/store_unit.sv diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md index 15c56beeb5..51b182403c 100644 --- a/CONTRIBUTING.md +++ b/CONTRIBUTING.md @@ -61,6 +61,12 @@ See the [Git Cheats](https://github.com/openhwgroup/core-v-verif/blob/master/Git 6. Push feature branch: `git push origin ` 7. From GitHub: submit a pull request +Please note that we do not accept outdated pull requests. +This makes sure the CI flow has run in the to-be version of the master. + +To allow us to update the pull request before merging it, please consider checking the "Allow edits from maintainers" checkbox. 
+Note that this can only be done with pull requests from your personal repository (it is impossible from organization repositories). + ## Coding Style For RTL coding, the OpenHW Group has adopted the [lowRISC Style Guides](https://github.com/lowRISC/style-guides/). diff --git a/Flist.ariane b/Flist.ariane index 8163532785..12b4629404 100644 --- a/Flist.ariane +++ b/Flist.ariane @@ -94,7 +94,6 @@ core/mmu_sv39x4/ptw_sv39x4.sv core/ariane_regfile_ff.sv core/re_name.sv core/scoreboard.sv -core/round_interval.sv core/store_buffer.sv core/amo_buffer.sv core/store_unit.sv diff --git a/Makefile b/Makefile index ea6920f168..9731d901b0 100644 --- a/Makefile +++ b/Makefile @@ -235,7 +235,7 @@ uart_src_sv:= corev_apu/fpga/src/apb_uart/src/slib_clock_div.sv \ corev_apu/fpga/src/apb_uart/src/apb_uart_wrap.sv uart_src_sv := $(addprefix $(root-dir), $(uart_src_sv)) -fpga_src := $(wildcard corev_apu/fpga/src/*.sv) $(wildcard corev_apu/fpga/src/ariane-ethernet/*.sv) common/local/util/tc_sram_fpga_wrapper.sv vendor/pulp-platform/fpga-support/rtl/SyncSpRamBeNx64.sv +fpga_src := $(wildcard corev_apu/fpga/src/*.sv) $(wildcard corev_apu/fpga/src/ariane-ethernet/*.sv) common/local/util/tc_sram_fpga_wrapper.sv common/local/util/hpdcache_sram_1rw.sv common/local/util/hpdcache_sram_wbyteenable_1rw.sv vendor/pulp-platform/fpga-support/rtl/SyncSpRamBeNx64.sv vendor/pulp-platform/fpga-support/rtl/SyncSpRamBeNx32.sv vendor/pulp-platform/fpga-support/rtl/SyncSpRam.sv fpga_src := $(addprefix $(root-dir), $(fpga_src)) src/bootrom/bootrom_$(XLEN).sv # look for testbenches @@ -271,11 +271,18 @@ incdir := $(CVA6_REPO_DIR)/vendor/pulp-platform/common_cells/include/ $(CVA6_REP $(SPIKE_INSTALL_DIR)/include/disasm/ # Compile and sim flags -compile_flag += -incr -64 -nologo -quiet -suppress 13262 -suppress 8607 -permissive -svinputport=compat +define+$(defines) -suppress 8386 -suppress vlog-2577 +compile_flag += -incr -64 -nologo -quiet -suppress 13262 -suppress 8607 +permissive -svinputport=compat 
+define+$(defines) -suppress 8386 -suppress vlog-2577 vopt_flag += -suppress 2085 -suppress 7063 -suppress 2698 -suppress 13262 +ifdef config-file + spike-yaml-plusarg = +config_file=$(spike_yaml) +endif + uvm-flags += +UVM_NO_RELNOTES +UVM_VERBOSITY=UVM_LOW -questa-flags += -t 1ns -64 $(gui-sim) $(QUESTASIM_FLAGS) +tohost_addr=$(tohost_addr) +define+QUESTA -suppress 3356 -suppress 3579 +questa-flags += -t 1ns -64 $(gui-sim) $(QUESTASIM_FLAGS) \ + +tohost_addr=$(shell ${RISCV}/bin/${CV_SW_PREFIX}nm -B $(elf) | grep -w tohost | cut -d' ' -f1) \ + +core_name=$(target) +define+QUESTA -suppress 3356 -suppress 3579 +report_file=$(report_file) \ + $(spike-yaml-plusarg) compile_flag_vhd += -64 -nologo -quiet -2008 # Iterate over all include directories and write them with +incdir+ prefixed @@ -305,7 +312,7 @@ ifdef preload endif ifdef spike-tandem - questa-cmd += -gblso $(SPIKE_INSTALL_DIR)/lib/libriscv.so + questa-cmd += -gblso $(SPIKE_INSTALL_DIR)/lib/libyaml-cpp.so -gblso $(SPIKE_INSTALL_DIR)/lib/libriscv.so endif # remote bitbang is enabled @@ -332,13 +339,13 @@ vcs_build: $(dpi-library)/ariane_dpi.so vcs: vcs_build cd $(vcs-library) && \ ./simv +permissive $(if $(VERDI), -verdi -do $(root-dir)/init_testharness.do,) \ - +elf_file=$(elf_file) ++$(elf_file) $(if $(spike-tandem),-sv_lib $(SPIKE_INSTALL_DIR)/libriscv) \ + +elf_file=$(elf_file) ++$(elf_file) $(if $(spike-tandem), -sv_lib $(SPIKE_INSTALL_DIR)/libyaml-cpp) -sv_lib $(SPIKE_INSTALL_DIR)/libriscv \ -sv_lib ../work-dpi/ariane_dpi | tee vcs.log # Build the TB and module using QuestaSim build: $(library) $(library)/.build-srcs $(library)/.build-tb $(dpi-library)/ariane_dpi.so # Optimize top level - $(VOPT) -64 -work $(library) $(top_level) -o $(top_level)_optimized +acc -check_synthesis -dpilib $(SPIKE_INSTALL_DIR)/lib/libriscv -dpilib $(SPIKE_INSTALL_DIR)/lib/lifesvr $(vopt_flag) + $(VOPT) -64 -work $(library) $(top_level) -o $(top_level)_optimized +acc -check_synthesis -dpilib $(SPIKE_INSTALL_DIR)/lib/libriscv 
-dpilib $(SPIKE_INSTALL_DIR)/lib/libfesvr -dpilib $(SPIKE_INSTALL_DIR)/lib/libyaml-cpp $(vopt_flag) # src files $(library)/.build-srcs: $(library) @@ -366,13 +373,13 @@ $(dpi-library)/%.o: corev_apu/tb/dpi/%.cc $(dpi_hdr) $(dpi-library)/ariane_dpi.so: $(dpi) mkdir -p $(dpi-library) # Compile C-code and generate .so file - $(CXX) -shared -m64 -o $(dpi-library)/ariane_dpi.so $? -L$(RISCV)/lib -L$(SPIKE_INSTALL_DIR)/lib -Wl,-rpath,$(RISCV)/lib -Wl,-rpath,$(SPIKE_INSTALL_DIR)/lib -lfesvr -lriscv + $(CXX) -shared -m64 -o $(dpi-library)/ariane_dpi.so $? -L$(RISCV)/lib -L$(SPIKE_INSTALL_DIR)/lib -Wl,-rpath,$(RISCV)/lib -Wl,-rpath,$(SPIKE_INSTALL_DIR)/lib -lfesvr -lriscv -lyaml-cpp $(dpi-library)/xrun_ariane_dpi.so: $(dpi) # Make Dir work-dpi mkdir -p $(dpi-library) # Compile C-code and generate .so file - $(CXX) -shared -m64 -o $(dpi-library)/xrun_ariane_dpi.so $? -L$(RISCV)/lib -L$(SPIKE_INSTALL_DIR)/lib -Wl,-rpath,$(RISCV)/lib -Wl,-rpath,$(SPIKE_INSTALL_DIR)/lib -lfesvr -lriscv + $(CXX) -shared -m64 -o $(dpi-library)/xrun_ariane_dpi.so $? -L$(RISCV)/lib -L$(SPIKE_INSTALL_DIR)/lib -Wl,-rpath,$(RISCV)/lib -Wl,-rpath,$(SPIKE_INSTALL_DIR)/lib -lfesvr -lriscv -lyaml-cpp # single test runs on Questa can be started by calling make , e.g. 
make towers.riscv # the test names are defined in ci/riscv-asm-tests.list, and in ci/riscv-benchmarks.list @@ -384,8 +391,8 @@ generate-trace-vsim: sim: build $(VSIM) +permissive $(questa-flags) $(questa-cmd) -lib $(library) +MAX_CYCLES=$(max_cycles) +UVM_TESTNAME=$(test_case) \ - +BASEDIR=$(riscv-test-dir) $(uvm-flags) -sv_lib $(SPIKE_INSTALL_DIR)/lib/libriscv -sv_lib $(SPIKE_INSTALL_DIR)/lib/libfesvr \ - -sv_lib $(SPIKE_INSTALL_DIR)/lib/libdisasm \ + +BASEDIR=$(riscv-test-dir) $(uvm-flags) -sv_lib $(SPIKE_INSTALL_DIR)/lib/libyaml-cpp -sv_lib $(SPIKE_INSTALL_DIR)/lib/libriscv -sv_lib $(SPIKE_INSTALL_DIR)/lib/libfesvr \ + -sv_lib $(SPIKE_INSTALL_DIR)/lib/libdisasm \ ${top_level}_optimized +permissive-off +elf_file=$(elf_file) ++$(elf_file) ++$(target-options) $(riscv-asm-tests): build @@ -616,7 +623,7 @@ verilate_command := $(verilator) --no-timing verilator_config.vlt $(if $(DEBUG), --trace-structs,) \ $(if $(TRACE_COMPACT), --trace-fst $(VL_INC_DIR)/verilated_fst_c.cpp) \ $(if $(TRACE_FAST), --trace $(VL_INC_DIR)/verilated_vcd_c.cpp) \ - -LDFLAGS "-L$(RISCV)/lib -L$(SPIKE_INSTALL_DIR)/lib -Wl,-rpath,$(RISCV)/lib -Wl,-rpath,$(SPIKE_INSTALL_DIR)/lib -lfesvr -lriscv -ldisasm $(if $(PROFILE), -g -pg,) -lpthread $(if $(TRACE_COMPACT), -lz,)" \ + -LDFLAGS "-L$(RISCV)/lib -L$(SPIKE_INSTALL_DIR)/lib -Wl,-rpath,$(RISCV)/lib -Wl,-rpath,$(SPIKE_INSTALL_DIR)/lib -lfesvr -lriscv -ldisasm -lyaml-cpp $(if $(PROFILE), -g -pg,) -lpthread $(if $(TRACE_COMPACT), -lz,)" \ -CFLAGS "$(CFLAGS)$(if $(PROFILE), -g -pg,) -DVL_DEBUG -I$(SPIKE_INSTALL_DIR)" \ $(if $(SPIKE_TANDEM), +define+SPIKE_TANDEM, ) \ --cc --vpi \ @@ -724,6 +731,11 @@ fpga_filter += $(addprefix $(root-dir), src/util/instr_trace_item.sv) fpga_filter += $(addprefix $(root-dir), common/local/util/instr_tracer.sv) fpga_filter += $(addprefix $(root-dir), vendor/pulp-platform/tech_cells_generic/src/rtl/tc_sram.sv) fpga_filter += $(addprefix $(root-dir), common/local/util/tc_sram_wrapper.sv) +fpga_filter += $(addprefix 
$(root-dir), corev_apu/tb/ariane_peripherals.sv) +fpga_filter += $(addprefix $(root-dir), corev_apu/tb/ariane_testharness.sv) +fpga_filter += $(addprefix $(root-dir), core/cache_subsystem/hpdcache/rtl/src/common/macros/behav/hpdcache_sram_1rw.sv) +fpga_filter += $(addprefix $(root-dir), core/cache_subsystem/hpdcache/rtl/src/common/macros/behav/hpdcache_sram_wbyteenable_1rw.sv) +fpga_filter += $(addprefix $(root-dir), core/cache_subsystem/hpdcache/rtl/src/common/macros/behav/hpdcache_sram_wmask_1rw.sv) src/bootrom/bootrom_$(XLEN).sv: $(MAKE) -C corev_apu/fpga/src/bootrom BOARD=$(BOARD) XLEN=$(XLEN) bootrom_$(XLEN).sv diff --git a/README.md b/README.md index 20be84f1eb..758a93fe14 100644 --- a/README.md +++ b/README.md @@ -1,7 +1,4 @@ -![Build Status](https://github.com/openhwgroup/cva6/actions/workflows/ci.yml/badge.svg?branch=master) - - -# CVA6 RISC-V CPU +# CVA6 RISC-V CPU [![Build Status](https://github.com/openhwgroup/cva6/actions/workflows/ci.yml/badge.svg?branch=master)](https://github.com/openhwgroup/cva6/actions/workflows/ci.yml) [![CVA6 dashboard](https://riscv-ci.pages.thales-invia.fr/dashboard/badge.svg)](https://riscv-ci.pages.thales-invia.fr/dashboard/) [![GitHub release](https://img.shields.io/github/release/openhwgroup/cva6?include_prereleases=&sort=semver&color=blue)](https://github.com/openhwgroup/cva6/releases/) CVA6 is a 6-stage, single-issue, in-order CPU which implements the 64-bit RISC-V instruction set. It fully implements I, M, A and C extensions as specified in Volume I: User-Level ISA V 2.3 as well as the draft privilege extension 1.10. It implements three privilege levels M, S, U to fully support a Unix-like operating system. Furthermore, it is compliant to the draft external debug spec 0.13. @@ -31,12 +28,14 @@ git submodule update --init --recursive :warning: It is **strongly recommended** to use the toolchain built with the provided scripts. -3. Set the RISCV environment variable. +3. Install `cmake`, version 3.14 or higher. + +4. 
Set the RISCV environment variable. ```sh export RISCV=/path/to/toolchain/installation/directory ``` -4. Install `help2man` and `device-tree-compiler` packages. +5. Install `help2man` and `device-tree-compiler` packages. For Debian-based Linux distributions, run : @@ -44,13 +43,13 @@ For Debian-based Linux distributions, run : sudo apt-get install help2man device-tree-compiler ``` -5. Install the riscv-dv requirements: +6. Install the riscv-dv requirements: ```sh pip3 install -r verif/sim/dv/requirements.txt ``` -6. Run these commands to install a custom Spike and Verilator (i.e. these versions must be used to simulate the CVA6) and [these](#running-regression-tests-simulations) tests suites. +7. Run these commands to install a custom Spike and Verilator (i.e. these versions must be used to simulate the CVA6) and [these](#running-regression-tests-simulations) tests suites. ```sh # DV_SIMULATORS is detailed in the next section export DV_SIMULATORS=veri-testharness,spike @@ -93,7 +92,7 @@ cd ./verif/sim python3 cva6.py --target cv32a60x --iss=$DV_SIMULATORS --iss_yaml=cva6.yaml \ --c_tests ../tests/custom/hello_world/hello_world.c \ ---linker=../tests/custom/common/test.ld \ +--linker=../../config/gen_from_riscv_config/linker/link.ld \ --gcc_opts="-static -mcmodel=medany -fvisibility=hidden -nostdlib \ -nostartfiles -g ../tests/custom/common/syscalls.c \ ../tests/custom/common/crt.S -lgcc \ @@ -413,6 +412,10 @@ If you use CVA6 in your academic work you can cite us:
+# Resources and Ecosystem + +The CVA6 core is part of a vibrant ecosystem. In [this document](RESOURCES.md), we gather pointers to this ecosystem (building blocks, designs, partners...) + # Acknowledgements Check out the [acknowledgements](ACKNOWLEDGEMENTS.md). diff --git a/RESOURCES.md b/RESOURCES.md new file mode 100644 index 0000000000..2f44b631d8 --- /dev/null +++ b/RESOURCES.md @@ -0,0 +1,96 @@ +# CVA6 Ecosystem and Resources + +The CORE-V CVA6 core is part of a large open-source ecosystem. In this page, we collect pointers to this ecosystem, so that CVA6 users can find their way. + +Please help improve this page, by filing an [issue](https://github.com/openhwgroup/cva6/issues) or a [pull request](https://github.com/openhwgroup/cva6/pulls). For pull requests, you need to sign the [Eclipse Contributor Agreement](https://www.eclipse.org/legal/ECA.php). + +> [!NOTE] +> We only collect here pointers to resources that are mature enough to be used by external users. +> Resources that reach the TRL-5 maturity (ready to integrate into production ICs) are clearly mentioned. +> Otherwise, you can assume a TRL-4 maturity. + +> [!WARNING] +> The CVA6 team is not liable for the other repositories. +> Assess their content and make sure they fit your needs and are mature enough for design. +> Please direct your issues or pull requests to these external repositories. + +## Our legacy + +CVA6 was designed by the [PULP Platform team](https://www.pulp-platform.org/). You can integrate it with many other PULP designs from [github.com/pulp-platform](https://github.com/pulp-platform). + +## Technical resources + +### SW Tools and OSes + +RISC-V tools for CVA6 and Buildroot Linux support are available [here](https://github.com/openhwgroup/cva6-sdk). + +Yocto Linux support for CVA6 is available [here](https://github.com/openhwgroup/meta-cva6-yocto). + +FreeRTOS support for CVA6 is available [here](https://github.com/FreeRTOS/FreeRTOS-Partner-Supported-Demos/tree/main/RISC-V_cva6).
+ +Zephyr support for CV64A6 will soon be available. + +This [tutorial](https://github.com/ThalesGroup/cva6-eclipse-demo) offers resources to debug CVA6 under Eclipse IDE. + +The OS ports below are on Digilent Genesys 2 board. + +### Related building blocks + +These building blocks fit very nicely with CVA6: + +- [OpenPiton](https://github.com/PrincetonUniversity/openpiton) is a many-core framework that supports CVA6. +- [Culsans/CV-TCCC](https://github.com/pulp-platform/culsans) is a multi-core infrastructure for a few CVA6 cores. +- [ARA/CV-VEC](https://github.com/pulp-platform/ara) is a vector unit for CVA6. +- [HPDcache](https://github.com/openhwgroup/cv-hpdcache) is a flexible (highly configurable) and high-throughput L1 cache. + +### Design examples (FPGA) + +The CVA6 repository contains the CVA6 core and a basic CPU design, the "APU" and its implementation on a Digilent Genesys 2 FPGA board. Here is a list of other CVA6-based FPGA designs: + +The [technical kits](https://github.com/thalesgroup/cva6-softcore-contest) of a student contest organized in France can be used as educational resources or as an easy way to get CVA6 up and running with a cheaper Digilent Zybo Z7-20 board. You will find in it: +- The 2020-2021 contest, focusing on PPA optimization; +- The 2021-2022 contest, focusing on energy optimization; +- The 2022-2023 contest, focusing on cybersecurity, including a port of Zephyr OS; +- The 2023-2024 contest, focusing on the acceleration of the MNIST digit recognition with custom extensions; +- The 2024-2025 contest, focusing on the frequency increase (_not released yet_); +- A treat with the support of Linux and a VGA output. + +[CVA6 with Xilinx Ethernet](https://github.com/cispa/CVA6-Vivado-Project-with-Xilinx-AXI-Ethernet/) is an alternative design which implements Xilinx 1G/2.5G Ethernet Subsystem on the Digilent Genesys 2 FPGA board. It has been tested with TFTP boot in u-boot and SSH in Linux. 
+ +### Designs (ASIC) + +Here are open-source ASIC designs based on CVA6: + +[Polara APU](https://github.com/openhwgroup/core-v-polara-apu) is a 4-core processor made with OpenPiton, ARA and CVA6. + +To be completed + +## Business resources + +### Service offer + +These companies are OpenHW members, have a good CVA6 knowledge, and offer CVA6-related service: + +**Zero-Day Labs** provides design and development services primarily related to embedded software/firmware security and hardware (RTL) for RISC-V. +With major contributions in the scope of RISC-V virtualization, the company has developed and maintains the RISC-V Hypervisor extension in CVA6 +and has recently made open-source the RISC-V AIA and IOMMU IPs. +Contact: [geral@zero-day-labs.com](mailto:geral@zero-day-labs.com). + +RISC-V made easy - experienced ASIC/FPGA service providers, [**PlanV**](https://planv.tech/) will help you navigate the IP landscape, +optimize your design workflows, and bring your RISC-V chip to life. + +[**MU-Electronics**](https://www.mu-e.com/) is a services company having its design center in Rabat-Morocco since 2003, specialized in ICs design from datasheet to GDSII generation +(RTL design, DFT, verification, full custom layout, place and route), firmware, driver & application development, test & validation, security implementation & support to certification. +MU-E has designed IPs and Chips down to 7nm. MU-E is participating in the European TRISTAN project and working for Thales on the verification of CVA6. + +[**10xEngineers**](https://10xengineers.ai/) is a design and verification services company focused on RISC-V. We contribute to compiler enablement, RTL design, +and verification efforts within the OpenHW ecosystem. Our work on CVA6 includes architectural and microarchitectural verification of MMU +and implementation of multiple RISC-V extensions, such as Bitmanip, Zicond, Zcb, and Zcmp. 
Our expert team assists companies in integrating, +customizing, and optimizing CVA6 to meet their unique requirements. + + _(To be completed based on companies' requests. Max 1 URL and 60 words per company)_ + +### Product ICs + +If you have integrated CVA6 into a production IC, we'd like to hear from you and mention it here. + diff --git a/ci/install-prereq.sh b/ci/install-prereq.sh index fbac904700..0f99c37f57 100644 --- a/ci/install-prereq.sh +++ b/ci/install-prereq.sh @@ -1,7 +1,7 @@ #!/bin/bash echo 'deb http://download.opensuse.org/repositories/home:/phiwag:/edatools/xUbuntu_20.04/ /' | sudo tee /etc/apt/sources.list.d/home:phiwag:edatools.list -curl -fsSL https://download.opensuse.org/repositories/home:phiwag:edatools/xUbuntu_20.04/Release.key | gpg --dearmor | sudo tee /etc/apt/trusted.gpg.d/home_phiwag_edatools.gpg > /dev/null +curl -fsSL https://download.opensuse.org/repositories/home:phiwag:edatools/Debian_Unstable/Release.key | gpg --dearmor | sudo tee /etc/apt/trusted.gpg.d/home_phiwag_edatools.gpg > /dev/null sudo apt update sudo apt install device-tree-compiler libfl-dev help2man diff --git a/common/local/util/hpdcache_sram_1rw.sv b/common/local/util/hpdcache_sram_1rw.sv new file mode 100644 index 0000000000..7893a4bad1 --- /dev/null +++ b/common/local/util/hpdcache_sram_1rw.sv @@ -0,0 +1,46 @@ +// Copyright 2025 Thales DIS France SAS +// +// Licensed under the Solderpad Hardware Licence, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License.
+// SPDX-License-Identifier: Apache-2.0 WITH SHL-2.0 +// You may obtain a copy of the License at https://solderpad.org/licenses/ +// +// Original Author: Yannick Casamatta - Thales +// Date: 22/10/2024 + +module hpdcache_sram_1rw +#( + parameter int unsigned ADDR_SIZE = 0, + parameter int unsigned DATA_SIZE = 0, + parameter int unsigned DEPTH = 2**ADDR_SIZE +) +( + input logic clk, + input logic rst_n, + input logic cs, + input logic we, + input logic [ADDR_SIZE-1:0] addr, + input logic [DATA_SIZE-1:0] wdata, + output logic [DATA_SIZE-1:0] rdata +); + + SyncSpRam #( + .ADDR_WIDTH(ADDR_SIZE), + .DATA_DEPTH(DEPTH), // usually 2**ADDR_WIDTH, but can be lower + .DATA_WIDTH(DATA_SIZE), + .OUT_REGS (0), + .SIM_INIT (1) // for simulation only, will not be synthesized + // 0: no init, 1: zero init, 2: random init + // note: on verilator, 2 is not supported. define the VERILATOR macro to work around. + )SyncSpRam_i( + .Clk_CI (clk), + .Rst_RBI (rst_n), + .CSel_SI (cs), + .WrEn_SI (we), + .Addr_DI (addr), + .WrData_DI(wdata), + .RdData_DO(rdata) + ); + + +endmodule : hpdcache_sram_1rw diff --git a/common/local/util/hpdcache_sram_wbyteenable_1rw.sv b/common/local/util/hpdcache_sram_wbyteenable_1rw.sv new file mode 100644 index 0000000000..d9f2996101 --- /dev/null +++ b/common/local/util/hpdcache_sram_wbyteenable_1rw.sv @@ -0,0 +1,114 @@ +// Copyright 2025 Thales DIS France SAS +// +// Licensed under the Solderpad Hardware Licence, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// SPDX-License-Identifier: Apache-2.0 WITH SHL-2.0 +// You may obtain a copy of the License at https://solderpad.org/licenses/ +// +// Original Author: Yannick Casamatta - Thales +// Date: 22/10/2024 + +module hpdcache_sram_wbyteenable_1rw +#( + parameter int unsigned ADDR_SIZE = 0, + parameter int unsigned DATA_SIZE = 0, + parameter int unsigned DEPTH = 2**ADDR_SIZE +) +( + input logic clk, + input logic rst_n, + input logic cs, + input logic we, + input logic [ADDR_SIZE-1:0] addr, + input logic [DATA_SIZE-1:0] wdata, + input logic [DATA_SIZE/8-1:0] wbyteenable, + output logic [DATA_SIZE-1:0] rdata +); + +if (DATA_SIZE == 128) begin + // Split the data into two 64-bit halves + logic [DATA_SIZE/2-1:0] wdata_low, wdata_high; + logic [DATA_SIZE/2-1:0] rdata_low, rdata_high; + logic [7:0] be_low, be_high; + assign wdata_low = wdata[63:0]; + assign wdata_high = wdata[127:64]; + assign be_low = wbyteenable[7:0]; + assign be_high = wbyteenable[15:8]; + + SyncSpRamBeNx64 #( + .ADDR_WIDTH(ADDR_SIZE), + .DATA_DEPTH(DEPTH), + .OUT_REGS (0), + .SIM_INIT (1) + ) SyncSpRam_0 ( + .Clk_CI (clk), + .Rst_RBI (rst_n), + .CSel_SI (cs), + .WrEn_SI (we), // Write to the low bank + .BEn_SI (be_low), + .Addr_DI (addr), + .WrData_DI(wdata_low), + .RdData_DO(rdata_low) + ); + + SyncSpRamBeNx64 #( + .ADDR_WIDTH(ADDR_SIZE), + .DATA_DEPTH(DEPTH), + .OUT_REGS (0), + .SIM_INIT (1) + ) SyncSpRam_1 ( + .Clk_CI (clk), + .Rst_RBI (rst_n), + .CSel_SI (cs), + .WrEn_SI (we), // Write to the high bank + .BEn_SI (be_high), + .Addr_DI (addr), + .WrData_DI(wdata_high), + .RdData_DO(rdata_high) + ); + + assign rdata = {rdata_high, rdata_low}; + +end else if (DATA_SIZE == 64) begin + SyncSpRamBeNx64 #( + .ADDR_WIDTH(ADDR_SIZE), + .DATA_DEPTH(DEPTH), // usually 2**ADDR_WIDTH, but can be lower + .OUT_REGS (0), + .SIM_INIT (1) // for simulation only, will not be synthesized + // 0: no init, 1: zero init, 2: random init + // note: on verilator, 2 is not supported.
define the VERILATOR macro to work around. + )SyncSpRam_i( + .Clk_CI (clk), + .Rst_RBI (rst_n), + .CSel_SI (cs), + .WrEn_SI (we), + .BEn_SI (wbyteenable), + .Addr_DI (addr), + .WrData_DI(wdata), + .RdData_DO(rdata) + ); +end else if (DATA_SIZE == 32) begin + SyncSpRamBeNx32 #( + .ADDR_WIDTH(ADDR_SIZE), + .DATA_DEPTH(DEPTH), // usually 2**ADDR_WIDTH, but can be lower + .OUT_REGS (0), + .SIM_INIT (1) // for simulation only, will not be synthesized + // 0: no init, 1: zero init, 2: random init + // note: on verilator, 2 is not supported. define the VERILATOR macro to work around. + )SyncSpRam_i( + .Clk_CI (clk), + .Rst_RBI (rst_n), + .CSel_SI (cs), + .WrEn_SI (we), + .BEn_SI (wbyteenable), + .Addr_DI (addr), + .WrData_DI(wdata), + .RdData_DO(rdata) + ); + +end else begin + $fatal(1, "DATASIZE=%d, is not supported " ,DATA_SIZE); +end + + +endmodule : hpdcache_sram_wbyteenable_1rw diff --git a/common/local/util/instr_tracer.sv b/common/local/util/instr_tracer.sv index 934509b2e7..9083664069 100644 --- a/common/local/util/instr_tracer.sv +++ b/common/local/util/instr_tracer.sv @@ -20,7 +20,7 @@ module instr_tracer #( parameter config_pkg::cva6_cfg_t CVA6Cfg = config_pkg::cva6_cfg_empty, parameter type bp_resolve_t = logic, - parameter type scoreboard_entry_t = logic, + parameter type scoreboard_entry_t = logic[303:0], // Fix for xcelium bug at runtime: does not have enough memory space reserved for scoreboard_entry parameter type interrupts_t = logic, parameter type exception_t = logic, parameter interrupts_t INTERRUPTS = '0 diff --git a/common/local/util/sram.sv b/common/local/util/sram.sv index f8dd934256..8c6c0d34d2 100644 --- a/common/local/util/sram.sv +++ b/common/local/util/sram.sv @@ -106,8 +106,8 @@ end // synthesis translate_off begin: i_tc_sram_wrapper_user begin: i_tc_sram - logic init_val; localparam type data_t = logic [63:0]; + data_t init_val [0:0]; data_t sram [NUM_WORDS-1:0] /* verilator public_flat */; end end diff --git
a/config/gen_from_riscv_config/README.md b/config/gen_from_riscv_config/README.md index 4de91244fc..2422c973e0 100644 --- a/config/gen_from_riscv_config/README.md +++ b/config/gen_from_riscv_config/README.md @@ -34,22 +34,28 @@ pip3 install -r requirements.txt ```bash #Generate Restructred-text documentation for Control and Status Registers (CSR) -python3 .py -s <../riscv-config/Config_Name/generated/isa_gen>.yaml -c <../riscv-config/Config_Name/generated/custom_gen>.yaml-m .yaml -t < Config_Name> +python3 .py -s <../riscv-config/Config_Name/generated/isa_gen>.yaml -c <../riscv-config/Config_Name/generated/custom_gen>.yaml -d <../riscv-config/Config_Name/generated/debug_gen>.yaml -m .yaml -t < Config_Name> #Generate Restructred-text documentation for ISA extensions python3 .py -s <../riscv-config/Config_Name/generated/isa_gen>.yaml -i .yaml -m .yaml -t < Config_Name> +#Generate the Yaml spike configuration file +python3 .py -s <../riscv-config/Config_Name/generated/isa_gen>.yaml -c <../riscv-config/Config_Name/generated/custom_gen>.yaml -i .mako -m .yaml -t < Config_Name> + ``` ## Usage with cv32a65x ```bash #Generate the Restructred-text documentation for Control and Status Registers (CSR) -python3 scripts/riscv_config_gen.py -s ../riscv-config/cv32a65x/generated/isa_gen.yaml -c ../riscv-config/cv32a65x/generated/custom_gen.yaml -m updaters/cv32a65x/csr_updater.yaml -t cv32a65x +python3 scripts/riscv_config_gen.py -s ../riscv-config/cv32a65x/generated/isa_gen.yaml -c ../riscv-config/cv32a65x/generated/custom_gen.yaml -d ../riscv-config/cv32a65x/generated/debug_gen.yaml -m updaters/cv32a65x/csr_updater.yaml -t cv32a65x #Generate the Restructred-text documentation for ISA extensions python3 scripts/riscv_config_gen.py -s ../riscv-config/cv32a65x/generated/isa_gen.yaml -i templates/isa_template.yaml -m updaters/cv32a65x/isa_updater.yaml -t cv32a65x +#Generate the Yaml spike configuration file +python3 scripts/riscv_config_gen.py -s 
../riscv-config/cv32a65x/generated/isa_gen.yaml -c ../riscv-config/cv32a65x/generated/custom_gen.yaml -i templates/spike.mako -m updaters/cv32a65x/spike_updater.yaml -t cv32a65x + ``` You could find your output files in this directory : @@ -59,12 +65,18 @@ if the output is ISA Documentation: if the output is CSR Documentation : `/csr/` + +if the output is Spike yaml : + `/spike/` + -for more details about How to write CSR or ISA Updater,see [UPDATERS](##Updaters) section +for more details about How to write CSR or ISA Updater,see [Updaters](#updaters) section + +for more details about How to write ISA template ,see [Annexes2](#annexes2) section -for more details about How to write ISA template ,see [Annexes2](##Annexes2) section +for more details about How to write spike template , see [mako](https://www.makotemplates.org/) section @@ -130,7 +142,7 @@ Example : ISA_Updater.yaml -If you want to modify any parameter for registers in RISC CONFIG YAML : - Format : +- Format : Register name : sub_feature : @@ -146,7 +158,7 @@ Example : ISA_Updater.yaml -If you want to exclude any registers base on condition : - Format : +- Format : exclude : @@ -155,7 +167,7 @@ Example : ISA_Updater.yaml sub_key : sub_value (if exist if not dont include it ) cond: value - Exemple : +- Exemple : exclude : @@ -168,12 +180,12 @@ Example : ISA_Updater.yaml Example : (PMPADDR , MHPMCOUNTER, ...) 
-Format : +- Format : Register Name : range : number -Exemple : +- Exemple : pmpaddr : @@ -184,6 +196,56 @@ CSR/ISA Updater read RISC-CONFIG.yaml and update the registers so if you want to +### SPIKE Updater + + +-If you want to modify any parameter in the Spike yaml: + +- Format : + + : + +- Example : + + + bootrom: false + +-If you want to modify any parameter in core config in Spike yaml : + +- Format : + + cores: + + : +- Example : + + bootrom: true +- Example : + + cores: + isa: rv32imc_zba_zbb_zbs_zbc_zicsr_zifencei + boot_addr: 0x80000000 + marchid: 0x3 + misa_we: false + misa_we_enable: true + pmpaddr0: 0x0 + pmpcfg0: 0x0 + pmpregions: 0x40 + usable_pmpregions : 0x8 + priv: M + status_fs_field_we: false + status_fs_field_we_enable: false + status_vs_field_we: false + status_vs_field_we_enable: false + misa_we: false + mstatus_write_mask: 0x00000088 + mstatus_override_mask: 0x00001800 + mtval_write_mask: 0x00000000 + unified_traps: true + + +The Spike Updater reads spike.yaml and updates the parameters, so if you want to add a parameter to spike.yaml you need to respect its architecture. + ## Annexes diff --git a/verif/tests/custom/common/test.ld b/config/gen_from_riscv_config/cv32a60x/linker/link.ld similarity index 81% rename from verif/tests/custom/common/test.ld rename to config/gen_from_riscv_config/cv32a60x/linker/link.ld index a50b017e81..a134ec289a 100644 --- a/verif/tests/custom/common/test.ld +++ b/config/gen_from_riscv_config/cv32a60x/linker/link.ld @@ -23,13 +23,29 @@ SECTIONS /* text: test code section */ . = 0x80000000; + _start_text = .; .text.init : { *(.text.init) } . = ALIGN(0x1000); .tohost : { *(.tohost) } + . = ALIGN(0x1000); + .uvmif : { *(.uvmif) } + . = ALIGN(0x1000); .text : { *(.text) } + . = ALIGN(0x1000); + .text.startup : { *(.text.startup) } + . = ALIGN(0x1000); + _end_text = .; + . = ALIGN(0x1000); + .rodata : { *(.rodata*)} + . = ALIGN(0x8); + .
= ALIGN(0x1000); + .page_table : { *(.page_table) } + .user_stack : { *(.user_stack) } + .kernel_data : { *(.kernel_data) } + .kernel_stack : { *(.kernel_stack) } /* data segment */ .data : { *(.data) } diff --git a/config/gen_from_riscv_config/cv32a60x/spike/spike.yaml b/config/gen_from_riscv_config/cv32a60x/spike/spike.yaml new file mode 100644 index 0000000000..cc07e68fc5 --- /dev/null +++ b/config/gen_from_riscv_config/cv32a60x/spike/spike.yaml @@ -0,0 +1,54 @@ +spike_param_tree: + bootrom: true + bootrom_base: 65536 + bootrom_size: 4096 + dram: true + dram_base: 2147483648 + dram_size: 1073741824 + generic_core_config: false + max_steps: 200000 + max_steps_enabled: false + isa: rv32imczicsr_zcb_zba_zbb_zbc_zbs + priv: M + core_configs: + - + isa: rv32imczicsr_zcb_zba_zbb_zbc_zbs + extensions: cv32a60x,cvxif + boot_addr: 2147483648 + marchid_override_mask: 0xFFFFFFFF + marchid_override_value: 0x3 + misa_write_mask: 0x0 + pmp_granularity: 8 + pmpaddr0: 0 + pmpcfg0: 0 + pmpregions_max: 64 + pmpregions_writable: 8 + priv: M + status_fs_field_we: false + status_fs_field_we_enable: false + status_vs_field_we: false + status_vs_field_we_enable: false + mstatus_write_mask: 136 + mstatus_override_mask: 6144 + mie_write_mask: 0x00000880 + mie_override_mask: 0xfffff77f + mie_override_value: 0x00000000 + mip_write_mask: 0x00000000 + mip_override_mask: 0xfffff77f + mip_override_value: 0x00000000 + mtval_write_mask: 0 + tinfo_accessible: 0 + mscontext_accessible: 0 + mcontext_accessible: 0 + tdata1_accessible: 0 + tdata2_accessible: 0 + tdata3_accessible: 0 + tselect_accessible: 0 + mhartid: 0 + mvendorid_override_mask : 0xFFFFFFFF + mvendorid_override_value: 1538 + csr_counters_injection: true + interrupts_injection: true + unified_traps: true + mcycleh_implemented: false + mhpmevent31_implemented: false diff --git a/config/gen_from_riscv_config/cv32a65x/csr/csr.adoc b/config/gen_from_riscv_config/cv32a65x/csr/csr.adoc new file mode 100644 index 0000000000..6d5279704b --- 
/dev/null +++ b/config/gen_from_riscv_config/cv32a65x/csr/csr.adoc @@ -0,0 +1,537 @@ +//// + Copyright (c) 2024 OpenHW Group + Copyright (c) 2024 Thales + SPDX-License-Identifier: Apache-2.0 WITH SHL-2.1 + Author: Abdessamii Oukalrazqou +//// + +=== csr + +==== Conventions + +In the subsequent sections, register fields are labeled with one of the following abbreviations: + +* WPRI (Writes Preserve Values, Reads Ignore Values): read/write field reserved +for future use. For forward compatibility, implementations that do not +furnish these fields must make them read-only zero. +* WLRL (Write/Read Only Legal Values): read/write CSR field that specifies +behavior for only a subset of possible bit encodings, with other bit encodings +reserved. +* WARL (Write Any Values, Reads Legal Values): read/write CSR fields which are +only defined for a subset of bit encodings, but allow any value to be written +while guaranteeing to return a legal value whenever read. +* ROCST (Read-Only Constant): A special case of WARL field which admits only one +legal value, and therefore, behaves as a constant field that silently ignores +writes. +* ROVAR (Read-Only Variable): A special case of WARL field which can take +multiple legal values but cannot be modified by software and depends only on +the architectural state of the hart. + +In particular, a register that is not internally divided +into multiple fields can be considered as containing a single field of XLEN bits. +This allows to clearly represent read-write registers holding a single legal value +(typically zero). + +==== Register Summary + +|=== +|Address | Register Name | Privilege | Description + +|0x300| `<<_MSTATUS,MSTATUS>>`|MRW|The mstatus register keeps track of and controls the hart's current operating state. +|0x301| `<<_MISA,MISA>>`|MRW|misa is a read-write register reporting the ISA supported by the hart. +|0x304| `<<_MIE,MIE>>`|MRW|The mie register is an MXLEN-bit read/write register containing interrupt enable bits. 
+|0x305| `<<_MTVEC,MTVEC>>`|MRW|MXLEN-bit read/write register that holds trap vector configuration. +|0x310| `<<_MSTATUSH,MSTATUSH>>`|MRW|The mstatush register keeps track of and controls the hart’s current operating state. +|0x323-0x33f| `<<_MHPMEVENT3-31,MHPMEVENT[3-31]>>`|MRW|The mhpmevent is a MXLEN-bit event register which controls mhpmcounter3. +|0x340| `<<_MSCRATCH,MSCRATCH>>`|MRW|The mscratch register is an MXLEN-bit read/write register dedicated for use by machine mode. +|0x341| `<<_MEPC,MEPC>>`|MRW|The mepc is a warl register that must be able to hold all valid physical and virtual addresses. +|0x342| `<<_MCAUSE,MCAUSE>>`|MRW|The mcause register stores the information regarding the trap. +|0x343| `<<_MTVAL,MTVAL>>`|MRW|The mtval is a warl register that holds the address of the instruction which caused the exception. +|0x344| `<<_MIP,MIP>>`|MRW|The mip register is an MXLEN-bit read/write register containing information on pending interrupts. +|0x3a0-0x3a1| `<<_PMPCFG0-1,PMPCFG[0-1]>>`|MRW|PMP configuration register +|0x3a2-0x3af| `<<_PMPCFG2-15,PMPCFG[2-15]>>`|MRW|PMP configuration register +|0x3b0-0x3b7| `<<_PMPADDR0-7,PMPADDR[0-7]>>`|MRW|Physical memory protection address register +|0x3b8-0x3ef| `<<_PMPADDR8-63,PMPADDR[8-63]>>`|MRW|Physical memory protection address register +|0x7c0| `<<_ICACHE,ICACHE>>`|MRW|the register controls the operation of the i-cache unit. +|0x7c1| `<<_DCACHE,DCACHE>>`|MRW|the register controls the operation of the d-cache unit. +|0xb00| `<<_MCYCLE,MCYCLE>>`|MRW|Counts the number of clock cycles executed from an arbitrary point in time. +|0xb02| `<<_MINSTRET,MINSTRET>>`|MRW|Counts the number of instructions completed from an arbitrary point in time. +|0xb03-0xb1f| `<<_MHPMCOUNTER3-31,MHPMCOUNTER[3-31]>>`|MRW|The mhpmcounter is a 64-bit counter. Returns lower 32 bits in RV32I mode. +|0xb80| `<<_MCYCLEH,MCYCLEH>>`|MRW|upper 32 bits of mcycle +|0xb82| `<<_MINSTRETH,MINSTRETH>>`|MRW|Upper 32 bits of minstret. 
+|0xb83-0xb9f| `<<_MHPMCOUNTER3-31H,MHPMCOUNTER[3-31]H>>`|MRW|The mhpmcounterh returns the upper half word in RV32I systems. +|0xf11| `<<_MVENDORID,MVENDORID>>`|MRO|32-bit read-only register providing the JEDEC manufacturer ID of the provider of the core. +|0xf12| `<<_MARCHID,MARCHID>>`|MRO|MXLEN-bit read-only register encoding the base microarchitecture of the hart. +|0xf13| `<<_MIMPID,MIMPID>>`|MRO|Provides a unique encoding of the version of the processor implementation. +|0xf14| `<<_MHARTID,MHARTID>>`|MRO|MXLEN-bit read-only register containing the integer ID of the hardware thread running the code. +|0xf15| `<<_MCONFIGPTR,MCONFIGPTR>>`|MRO|MXLEN-bit read-only register that holds the physical address of a configuration data structure. +|=== + +==== Register Description + +[[_MSTATUS]] +===== MSTATUS + +Address:: 0x300 +Reset Value:: 0x00001800 +Privilege:: MRW +Description:: The mstatus register keeps track of and controls the hart's current operating state. + +|=== +| Bits | Field Name | Reset Value | Type | Legal Values | Description + +| 0 | UIE | 0x0 | ROCST | 0x0 | Stores the state of the user mode interrupts. +| 1 | SIE | 0x0 | ROCST | 0x0 | Stores the state of the supervisor mode interrupts. +| 2 | RESERVED_2 | 0x0 | WPRI | | _Reserved_ +| 3 | MIE | 0x0 | WLRL | 0x0 - 0x1 | Stores the state of the machine mode interrupts. +| 4 | UPIE | 0x0 | ROCST | 0x0 | Stores the state of the user mode interrupts prior to the trap. +| 5 | SPIE | 0x0 | ROCST | 0x0 | Stores the state of the supervisor mode interrupts prior to the trap. +| 6 | UBE | 0x0 | ROCST | 0x0 | control the endianness of memory accesses other than instruction fetches for user mode +| 7 | MPIE | 0x0 | WLRL | 0x0 - 0x1 | Stores the state of the machine mode interrupts prior to the trap. +| 8 | SPP | 0x0 | ROCST | 0x0 | Stores the previous priority mode for supervisor. 
+| [10:9] | RESERVED_9 | 0x0 | WPRI | | _Reserved_ +| [12:11] | MPP | 0x3 | WARL | 0x3 | Stores the previous priority mode for machine. +| [14:13] | FS | 0x0 | ROCST | 0x0 | Encodes the status of the floating-point unit, including the CSR fcsr and floating-point data registers. +| [16:15] | XS | 0x0 | ROCST | 0x0 | Encodes the status of additional user-mode extensions and associated state. +| 17 | MPRV | 0x0 | ROCST | 0x0 | Modifies the privilege level at which loads and stores execute in all privilege modes. +| 18 | SUM | 0x0 | ROCST | 0x0 | Modifies the privilege with which S-mode loads and stores access virtual memory. +| 19 | MXR | 0x0 | ROCST | 0x0 | Modifies the privilege with which loads access virtual memory. +| 20 | TVM | 0x0 | ROCST | 0x0 | Supports intercepting supervisor virtual-memory management operations. +| 21 | TW | 0x0 | ROCST | 0x0 | Supports intercepting the WFI instruction. +| 22 | TSR | 0x0 | ROCST | 0x0 | Supports intercepting the supervisor exception return instruction. +| 23 | SPELP | 0x0 | ROCST | 0x0 | Supervisor mode previous expected-landing-pad (ELP) state. +| [30:24] | RESERVED_24 | 0x0 | WPRI | | _Reserved_ +| 31 | SD | 0x0 | ROCST | 0x0 | Read-only bit that summarizes whether either the FS field or XS field signals the presence of some dirty state. +|=== + +[[_MISA]] +===== MISA + +Address:: 0x301 +Reset Value:: 0x40001106 +Privilege:: MRW +Description:: misa is a read-write register reporting the ISA supported by the hart. + +|=== +| Bits | Field Name | Reset Value | Type | Legal Values | Description + +| [25:0] | EXTENSIONS | 0x1106 | ROCST | 0x1106 | Encodes the presence of the standard extensions, with a single bit per letter of the alphabet. +| [29:26] | RESERVED_26 | 0x0 | WPRI | | _Reserved_ +| [31:30] | MXL | 0x1 | WARL | 0x1 | Encodes the native base integer ISA width. 
+|=== + +[[_MIE]] +===== MIE + +Address:: 0x304 +Reset Value:: 0x00000000 +Privilege:: MRW +Description:: The mie register is an MXLEN-bit read/write register containing interrupt enable bits. + +|=== +| Bits | Field Name | Reset Value | Type | Legal Values | Description + +| 0 | USIE | 0x0 | ROCST | 0x0 | User Software Interrupt enable. +| 1 | SSIE | 0x0 | ROCST | 0x0 | Supervisor Software Interrupt enable. +| 2 | VSSIE | 0x0 | ROCST | 0x0 | VS-level Software Interrupt enable. +| 3 | MSIE | 0x0 | ROCST | 0x0 | Machine Software Interrupt enable. +| 4 | UTIE | 0x0 | ROCST | 0x0 | User Timer Interrupt enable. +| 5 | STIE | 0x0 | ROCST | 0x0 | Supervisor Timer Interrupt enable. +| 6 | VSTIE | 0x0 | ROCST | 0x0 | VS-level Timer Interrupt enable. +| 7 | MTIE | 0x0 | WLRL | 0x0 - 0x1 | Machine Timer Interrupt enable. +| 8 | UEIE | 0x0 | ROCST | 0x0 | User External Interrupt enable. +| 9 | SEIE | 0x0 | ROCST | 0x0 | Supervisor External Interrupt enable. +| 10 | VSEIE | 0x0 | ROCST | 0x0 | VS-level External Interrupt enable. +| 11 | MEIE | 0x0 | WLRL | 0x0 - 0x1 | Machine External Interrupt enable. +| 12 | SGEIE | 0x0 | ROCST | 0x0 | HS-level External Interrupt enable. +| [31:13] | RESERVED_13 | 0x0 | WPRI | | _Reserved_ +|=== + +[[_MTVEC]] +===== MTVEC + +Address:: 0x305 +Reset Value:: 0x80010000 +Privilege:: MRW +Description:: MXLEN-bit read/write register that holds trap vector configuration. + +|=== +| Bits | Field Name | Reset Value | Type | Legal Values | Description + +| [1:0] | MODE | 0x0 | WARL | 0x0 | Vector mode. +| [31:2] | BASE | 0x20004000 | WARL | 0x00000000 - 0x3FFFFFFF | Vector base address. +|=== + +[[_MSTATUSH]] +===== MSTATUSH + +Address:: 0x310 +Reset Value:: 0x00000000 +Privilege:: MRW +Description:: The mstatush register keeps track of and controls the hart’s current operating state. 
+ +|=== +| Bits | Field Name | Reset Value | Type | Legal Values | Description + +| [3:0] | RESERVED_0 | 0x0 | WPRI | | _Reserved_ +| 4 | SBE | 0x0 | ROCST | 0x0 | Controls the endianness of memory accesses other than instruction fetches for supervisor mode +| 5 | MBE | 0x0 | ROCST | 0x0 | Controls the endianness of memory accesses other than instruction fetches for machine mode +| 6 | GVA | 0x0 | ROCST | 0x0 | Set when a trap into machine mode writes a guest virtual address to mtval. +| 7 | MPV | 0x0 | ROCST | 0x0 | Stores the virtualization mode prior to the trap. +| 8 | RESERVED_8 | 0x0 | WPRI | | _Reserved_ +| 9 | MPELP | 0x0 | ROCST | 0x0 | Machine mode previous expected-landing-pad (ELP) state. +| [31:10] | RESERVED_10 | 0x0 | WPRI | | _Reserved_ +|=== + +[[_MHPMEVENT3-31]] +===== MHPMEVENT[3-31] + +Address:: 0x323-0x33f +Reset Value:: 0x00000000 +Privilege:: MRW +Description:: The mhpmevent is a MXLEN-bit event register which controls mhpmcounter3. + +|=== +| Bits | Field Name | Reset Value | Type | Legal Values | Description + +| [31:0] | MHPMEVENT[I] | 0x00000000 | ROCST | 0x0 | The mhpmevent is a MXLEN-bit event register which controls mhpmcounter3. +|=== + +[[_MSCRATCH]] +===== MSCRATCH + +Address:: 0x340 +Reset Value:: 0x00000000 +Privilege:: MRW +Description:: The mscratch register is an MXLEN-bit read/write register dedicated for use by machine mode. + +|=== +| Bits | Field Name | Reset Value | Type | Legal Values | Description + +| [31:0] | MSCRATCH | 0x00000000 | WARL | 0x00000000 - 0xFFFFFFFF | The mscratch register is an MXLEN-bit read/write register dedicated for use by machine mode. +|=== + +[[_MEPC]] +===== MEPC + +Address:: 0x341 +Reset Value:: 0x00000000 +Privilege:: MRW +Description:: The mepc is a warl register that must be able to hold all valid physical and virtual addresses.
+ +|=== +| Bits | Field Name | Reset Value | Type | Legal Values | Description + +| [31:0] | MEPC | 0x00000000 | WARL | 0x00000000 - 0xFFFFFFFF | The mepc is a warl register that must be able to hold all valid physical and virtual addresses. +|=== + +[[_MCAUSE]] +===== MCAUSE + +Address:: 0x342 +Reset Value:: 0x00000000 +Privilege:: MRW +Description:: The mcause register stores the information regarding the trap. + +|=== +| Bits | Field Name | Reset Value | Type | Legal Values | Description + +| [30:0] | EXCEPTION_CODE | 0x0 | WLRL | 0x0 - 0x8, 0xb | Encodes the exception code. +| 31 | INTERRUPT | 0x0 | WLRL | 0x0 - 0x1 | Indicates whether the trap was due to an interrupt. +|=== + +[[_MTVAL]] +===== MTVAL + +Address:: 0x343 +Reset Value:: 0x00000000 +Privilege:: MRW +Description:: The mtval is a warl register that holds the address of the instruction which caused the exception. + +|=== +| Bits | Field Name | Reset Value | Type | Legal Values | Description + +| [31:0] | MTVAL | 0x00000000 | ROCST | 0x0 | The mtval is a warl register that holds the address of the instruction which caused the exception. +|=== + +[[_MIP]] +===== MIP + +Address:: 0x344 +Reset Value:: 0x00000000 +Privilege:: MRW +Description:: The mip register is an MXLEN-bit read/write register containing information on pending interrupts. + +|=== +| Bits | Field Name | Reset Value | Type | Legal Values | Description + +| 0 | USIP | 0x0 | ROCST | 0x0 | User Software Interrupt Pending. +| 1 | SSIP | 0x0 | ROCST | 0x0 | Supervisor Software Interrupt Pending. +| 2 | VSSIP | 0x0 | ROCST | 0x0 | VS-level Software Interrupt Pending. +| 3 | MSIP | 0x0 | ROCST | 0x0 | Machine Software Interrupt Pending. +| 4 | UTIP | 0x0 | ROCST | 0x0 | User Timer Interrupt Pending. +| 5 | STIP | 0x0 | ROCST | 0x0 | Supervisor Timer Interrupt Pending. +| 6 | VSTIP | 0x0 | ROCST | 0x0 | VS-level Timer Interrupt Pending. +| 7 | MTIP | 0x0 | ROVAR | 0x0 - 0x1 | Machine Timer Interrupt Pending. 
+| 8 | UEIP | 0x0 | ROCST | 0x0 | User External Interrupt Pending. +| 9 | SEIP | 0x0 | ROCST | 0x0 | Supervisor External Interrupt Pending. +| 10 | VSEIP | 0x0 | ROCST | 0x0 | VS-level External Interrupt Pending. +| 11 | MEIP | 0x0 | ROVAR | 0x0 - 0x1 | Machine External Interrupt Pending. +| 12 | SGEIP | 0x0 | ROCST | 0x0 | HS-level External Interrupt Pending. +| [31:13] | RESERVED_13 | 0x0 | WPRI | | _Reserved_ +|=== + +[[_PMPCFG0-1]] +===== PMPCFG[0-1] + +Address:: 0x3a0-0x3a1 +Reset Value:: 0x00000000 +Privilege:: MRW +Description:: PMP configuration register + +|=== +| Bits | Field Name | Reset Value | Type | Legal Values | Description + +| [2:0] | PMP[I*4 +0]CFG.RWX | 0x0 | WARL | 0x0, 0x1, 0x3, 0x4, 0x5, 0x7 | PMP[I*4 +0]CFG collective R, W and X field (R is bit 0, X is bit 2) +| [5:4] | PMP[I*4 +0]CFG.A | 0x0 | WARL | 0x0 - 0x1 | PMP[I*4 +0]CFG address-matching mode (A) +| 7 | PMP[I*4 +0]CFG.L | 0x0 | WARL | 0x0 - 0x1 | PMP[I*4 +0]CFG entry locked (L) +| [10:8] | PMP[I*4 +1]CFG | 0x0 | WARL | 0x0, 0x1, 0x3, 0x4, 0x5, 0x7 | PMP[I*4 +1]CFG collective R, W and X field (R is bit 0, X is bit 2) +| [13:12] | PMP[I*4 +1]CFG | 0x0 | WARL | 0x0 - 0x1 | PMP[I*4 +1]CFG address-matching mode (A) +| 15 | PMP[I*4 +1]CFG | 0x0 | WARL | 0x0 - 0x1 | PMP[I*4 +1]CFG entry locked (L) +| [18:16] | PMP[I*4 +2]CFG | 0x0 | WARL | 0x0, 0x1, 0x3, 0x4, 0x5, 0x7 | PMP[I*4 +2]CFG collective R, W and X field (R is bit 0, X is bit 2) +| [21:20] | PMP[I*4 +2]CFG | 0x0 | WARL | 0x0 - 0x1 | PMP[I*4 +2]CFG address-matching mode (A) +| 23 | PMP[I*4 +2]CFG | 0x0 | WARL | 0x0 - 0x1 | PMP[I*4 +2]CFG entry locked (L) +| [26:24] | PMP[I*4 +3]CFG | 0x0 | WARL | 0x0, 0x1, 0x3, 0x4, 0x5, 0x7 | PMP[I*4 +3]CFG collective R, W and X field (R is bit 0, X is bit 2) +| [29:28] | PMP[I*4 +3]CFG | 0x0 | WARL | 0x0 - 0x1 | PMP[I*4 +3]CFG address-matching mode (A) +| 31 | PMP[I*4 +3]CFG | 0x0 | WARL | 0x0 - 0x1 | PMP[I*4 +3]CFG entry locked (L) +|=== + +[[_PMPCFG2-15]] +===== PMPCFG[2-15] + +Address::
0x3a2-0x3af +Reset Value:: 0x00000000 +Privilege:: MRW +Description:: PMP configuration register + +|=== +| Bits | Field Name | Reset Value | Type | Legal Values | Description + +| [7:0] | PMP[I*4 +0]CFG | 0x0 | ROCST | 0x0 | pmp configuration bits +| [15:8] | PMP[I*4 +1]CFG | 0x0 | ROCST | 0x0 | pmp configuration bits +| [23:16] | PMP[I*4 +2]CFG | 0x0 | ROCST | 0x0 | pmp configuration bits +| [31:24] | PMP[I*4 +3]CFG | 0x0 | ROCST | 0x0 | pmp configuration bits +|=== + +[[_PMPADDR0-7]] +===== PMPADDR[0-7] + +Address:: 0x3b0-0x3b7 +Reset Value:: 0x00000000 +Privilege:: MRW +Description:: Physical memory protection address register + +|=== +| Bits | Field Name | Reset Value | Type | Legal Values | Description + +| [31:0] | PMPADDR[I] | 0x00000000 | WARL | 0x00000000 - 0xFFFFFFFF | Physical memory protection address register +|=== + +[[_PMPADDR8-63]] +===== PMPADDR[8-63] + +Address:: 0x3b8-0x3ef +Reset Value:: 0x00000000 +Privilege:: MRW +Description:: Physical memory protection address register + +|=== +| Bits | Field Name | Reset Value | Type | Legal Values | Description + +| [31:0] | PMPADDR[I] | 0x00000000 | ROCST | 0x0 | Physical memory protection address register +|=== + +[[_ICACHE]] +===== ICACHE + +Address:: 0x7c0 +Reset Value:: 0x00000001 +Privilege:: MRW +Description:: the register controls the operation of the i-cache unit. + +|=== +| Bits | Field Name | Reset Value | Type | Legal Values | Description + +| 0 | ICACHE | 0x1 | RW | 0x1 | bit for cache-enable of instruction cache +| [31:1] | RESERVED_1 | 0x0 | WPRI | | _Reserved_ +|=== + +[[_DCACHE]] +===== DCACHE + +Address:: 0x7c1 +Reset Value:: 0x00000001 +Privilege:: MRW +Description:: the register controls the operation of the d-cache unit. 
+ +|=== +| Bits | Field Name | Reset Value | Type | Legal Values | Description + +| 0 | DCACHE | 0x1 | RW | 0x1 | bit for cache-enable of data cache +| [31:1] | RESERVED_1 | 0x0 | WPRI | | _Reserved_ +|=== + +[[_MCYCLE]] +===== MCYCLE + +Address:: 0xb00 +Reset Value:: 0x00000000 +Privilege:: MRW +Description:: Counts the number of clock cycles executed from an arbitrary point in time. + +|=== +| Bits | Field Name | Reset Value | Type | Legal Values | Description + +| [31:0] | MCYCLE | 0x00000000 | WARL | 0x00000000 - 0xFFFFFFFF | Counts the number of clock cycles executed from an arbitrary point in time. +|=== + +[[_MINSTRET]] +===== MINSTRET + +Address:: 0xb02 +Reset Value:: 0x00000000 +Privilege:: MRW +Description:: Counts the number of instructions completed from an arbitrary point in time. + +|=== +| Bits | Field Name | Reset Value | Type | Legal Values | Description + +| [31:0] | MINSTRET | 0x00000000 | WARL | 0x00000000 - 0xFFFFFFFF | Counts the number of instructions completed from an arbitrary point in time. +|=== + +[[_MHPMCOUNTER3-31]] +===== MHPMCOUNTER[3-31] + +Address:: 0xb03-0xb1f +Reset Value:: 0x00000000 +Privilege:: MRW +Description:: The mhpmcounter is a 64-bit counter. Returns lower 32 bits in RV32I mode. + +|=== +| Bits | Field Name | Reset Value | Type | Legal Values | Description + +| [31:0] | MHPMCOUNTER[I] | 0x00000000 | ROCST | 0x0 | The mhpmcounter is a 64-bit counter. Returns lower 32 bits in RV32I mode. +|=== + +[[_MCYCLEH]] +===== MCYCLEH + +Address:: 0xb80 +Reset Value:: 0x00000000 +Privilege:: MRW +Description:: upper 32 bits of mcycle + +|=== +| Bits | Field Name | Reset Value | Type | Legal Values | Description + +| [31:0] | MCYCLEH | 0x00000000 | WARL | 0x00000000 - 0xFFFFFFFF | upper 32 bits of mcycle +|=== + +[[_MINSTRETH]] +===== MINSTRETH + +Address:: 0xb82 +Reset Value:: 0x00000000 +Privilege:: MRW +Description:: Upper 32 bits of minstret. 
+ +|=== +| Bits | Field Name | Reset Value | Type | Legal Values | Description + +| [31:0] | MINSTRETH | 0x00000000 | WARL | 0x00000000 - 0xFFFFFFFF | Upper 32 bits of minstret. +|=== + +[[_MHPMCOUNTER3-31H]] +===== MHPMCOUNTER[3-31]H + +Address:: 0xb83-0xb9f +Reset Value:: 0x00000000 +Privilege:: MRW +Description:: The mhpmcounterh returns the upper half word in RV32I systems. + +|=== +| Bits | Field Name | Reset Value | Type | Legal Values | Description + +| [31:0] | MHPMCOUNTER[I]H | 0x00000000 | ROCST | 0x0 | The mhpmcounterh returns the upper half word in RV32I systems. +|=== + +[[_MVENDORID]] +===== MVENDORID + +Address:: 0xf11 +Reset Value:: 0x00000602 +Privilege:: MRO +Description:: 32-bit read-only register providing the JEDEC manufacturer ID of the provider of the core. + +|=== +| Bits | Field Name | Reset Value | Type | Legal Values | Description + +| [31:0] | MVENDORID | 0x00000602 | ROCST | 0x602 | 32-bit read-only register providing the JEDEC manufacturer ID of the provider of the core. +|=== + +[[_MARCHID]] +===== MARCHID + +Address:: 0xf12 +Reset Value:: 0x00000003 +Privilege:: MRO +Description:: MXLEN-bit read-only register encoding the base microarchitecture of the hart. + +|=== +| Bits | Field Name | Reset Value | Type | Legal Values | Description + +| [31:0] | MARCHID | 0x00000003 | ROCST | 0x3 | MXLEN-bit read-only register encoding the base microarchitecture of the hart. +|=== + +[[_MIMPID]] +===== MIMPID + +Address:: 0xf13 +Reset Value:: 0x00000000 +Privilege:: MRO +Description:: Provides a unique encoding of the version of the processor implementation. + +|=== +| Bits | Field Name | Reset Value | Type | Legal Values | Description + +| [31:0] | MIMPID | 0x00000000 | ROCST | 0x0 | Provides a unique encoding of the version of the processor implementation. 
+|=== + +[[_MHARTID]] +===== MHARTID + +Address:: 0xf14 +Reset Value:: 0x00000000 +Privilege:: MRO +Description:: MXLEN-bit read-only register containing the integer ID of the hardware thread running the code. + +|=== +| Bits | Field Name | Reset Value | Type | Legal Values | Description + +| [31:0] | MHARTID | 0x00000000 | ROCST | 0x0 | MXLEN-bit read-only register containing the integer ID of the hardware thread running the code. +|=== + +[[_MCONFIGPTR]] +===== MCONFIGPTR + +Address:: 0xf15 +Reset Value:: 0x00000000 +Privilege:: MRO +Description:: MXLEN-bit read-only register that holds the physical address of a configuration data structure. + +|=== +| Bits | Field Name | Reset Value | Type | Legal Values | Description + +| [31:0] | MCONFIGPTR | 0x00000000 | ROCST | 0x0 | MXLEN-bit read-only register that holds the physical address of a configuration data structure. +|=== + diff --git a/config/gen_from_riscv_config/cv32a65x/csr/csr.rst b/config/gen_from_riscv_config/cv32a65x/csr/csr.rst index ad95398249..7053c1f181 100644 --- a/config/gen_from_riscv_config/cv32a65x/csr/csr.rst +++ b/config/gen_from_riscv_config/cv32a65x/csr/csr.rst @@ -65,9 +65,13 @@ Register Summary +-------------+---------------------------------------------+-------------+----------------------------------------------------------------------------------------------------+ | 0x344 | `MIP <#MIP>`_ | MRW | The mip register is an MXLEN-bit read/write register containing information on pending interrupts. 
| +-------------+---------------------------------------------+-------------+----------------------------------------------------------------------------------------------------+ -| 0x3a0-0x3a3 | `PMPCFG[0-3] <#PMPCFG[0-3]>`_ | MRW | PMP configuration register | +| 0x3a0-0x3a1 | `PMPCFG[0-1] <#PMPCFG[0-1]>`_ | MRW | PMP configuration register | +-------------+---------------------------------------------+-------------+----------------------------------------------------------------------------------------------------+ -| 0x3b0-0x3bf | `PMPADDR[0-15] <#PMPADDR[0-15]>`_ | MRW | Physical memory protection address register | +| 0x3a2-0x3af | `PMPCFG[2-15] <#PMPCFG[2-15]>`_ | MRW | PMP configuration register | ++-------------+---------------------------------------------+-------------+----------------------------------------------------------------------------------------------------+ +| 0x3b0-0x3b7 | `PMPADDR[0-7] <#PMPADDR[0-7]>`_ | MRW | Physical memory protection address register | ++-------------+---------------------------------------------+-------------+----------------------------------------------------------------------------------------------------+ +| 0x3b8-0x3ef | `PMPADDR[8-63] <#PMPADDR[8-63]>`_ | MRW | Physical memory protection address register | +-------------+---------------------------------------------+-------------+----------------------------------------------------------------------------------------------------+ | 0x7c0 | `ICACHE <#ICACHE>`_ | MRW | the register controls the operation of the i-cache unit. | +-------------+---------------------------------------------+-------------+----------------------------------------------------------------------------------------------------+ @@ -206,7 +210,7 @@ MIE +---------+--------------+---------------+--------+----------------+---------------------------------------+ | 6 | VSTIE | 0x0 | ROCST | 0x0 | VS-level Timer Interrupt enable. 
| +---------+--------------+---------------+--------+----------------+---------------------------------------+ -| 7 | MTIE | 0x0 | ROVAR | 0x0 - 0x1 | Machine Timer Interrupt enable. | +| 7 | MTIE | 0x0 | WLRL | 0x0 - 0x1 | Machine Timer Interrupt enable. | +---------+--------------+---------------+--------+----------------+---------------------------------------+ | 8 | UEIE | 0x0 | ROCST | 0x0 | User External Interrupt enable. | +---------+--------------+---------------+--------+----------------+---------------------------------------+ @@ -214,7 +218,7 @@ MIE +---------+--------------+---------------+--------+----------------+---------------------------------------+ | 10 | VSEIE | 0x0 | ROCST | 0x0 | VS-level External Interrupt enable. | +---------+--------------+---------------+--------+----------------+---------------------------------------+ -| 11 | MEIE | 0x0 | ROVAR | 0x0 - 0x1 | Machine External Interrupt enable. | +| 11 | MEIE | 0x0 | WLRL | 0x0 - 0x1 | Machine External Interrupt enable. | +---------+--------------+---------------+--------+----------------+---------------------------------------+ | 12 | SGEIE | 0x0 | ROCST | 0x0 | HS-level External Interrupt enable. | +---------+--------------+---------------+--------+----------------+---------------------------------------+ @@ -285,7 +289,7 @@ MHPMEVENT[3-31] +--------+--------------+---------------+--------+----------------+--------------------------------------------------------------------------+ | Bits | Field Name | Reset Value | Type | Legal Values | Description | +========+==============+===============+========+================+==========================================================================+ -| [31:0] | MHPMEVENT[I] | 0x00000000 | ROCST | 0x00000000 | The mhpmevent is a MXLEN-bit event register which controls mhpmcounter3. | +| [31:0] | MHPMEVENT[I] | 0x00000000 | ROCST | 0x0 | The mhpmevent is a MXLEN-bit event register which controls mhpmcounter3. 
| +--------+--------------+---------------+--------+----------------+--------------------------------------------------------------------------+ @@ -336,7 +340,7 @@ MCAUSE +--------+----------------+---------------+--------+----------------+-----------------------------------------------------+ | Bits | Field Name | Reset Value | Type | Legal Values | Description | +========+================+===============+========+================+=====================================================+ -| [30:0] | EXCEPTION_CODE | 0x0 | WLRL | 0 - 15 | Encodes the exception code. | +| [30:0] | EXCEPTION_CODE | 0x0 | WLRL | 0x0 - 0x8, 0xb | Encodes the exception code. | +--------+----------------+---------------+--------+----------------+-----------------------------------------------------+ | 31 | INTERRUPT | 0x0 | WLRL | 0x0 - 0x1 | Indicates whether the trap was due to an interrupt. | +--------+----------------+---------------+--------+----------------+-----------------------------------------------------+ @@ -355,7 +359,7 @@ MTVAL +--------+--------------+---------------+--------+----------------+----------------------------------------------------------------------------------------------------+ | Bits | Field Name | Reset Value | Type | Legal Values | Description | +========+==============+===============+========+================+====================================================================================================+ -| [31:0] | MTVAL | 0x00000000 | ROCST | 0x00000000 | The mtval is a warl register that holds the address of the instruction which caused the exception. | +| [31:0] | MTVAL | 0x00000000 | ROCST | 0x0 | The mtval is a warl register that holds the address of the instruction which caused the exception. 
| +--------+--------------+---------------+--------+----------------+----------------------------------------------------------------------------------------------------+ @@ -402,33 +406,55 @@ MIP +---------+--------------+---------------+--------+----------------+----------------------------------------+ -.. .. _PMPCFG[0-3]::: -PMPCFG[0-3] +.. .. _PMPCFG[0-1]::: +PMPCFG[0-1] ~~~~~~~~~~~ -:Address: 0x3a0-0x3af +:Address: 0x3a0-0x3a1 :Reset Value: 0x00000000 :Privilege: MRW :Description: PMP configuration register -+---------+-----------------+---------------+--------+----------------+------------------------+ -| Bits | Field Name | Reset Value | Type | Legal Values | Description | -+=========+=================+===============+========+================+========================+ -| [7:0] | PMP[I*4 + 0]CFG | 0x0 | WARL | 0x00 - 0xFF | pmp configuration bits | -+---------+-----------------+---------------+--------+----------------+------------------------+ -| [15:8] | PMP[I*4 + 1]CFG | 0x0 | WARL | 0x00 - 0xFF | pmp configuration bits | -+---------+-----------------+---------------+--------+----------------+------------------------+ -| [23:16] | PMP[I*4 + 2]CFG | 0x0 | WARL | 0x00 - 0xFF | pmp configuration bits | -+---------+-----------------+---------------+--------+----------------+------------------------+ -| [31:24] | PMP[I*4 + 3]CFG | 0x0 | WARL | 0x00 - 0xFF | pmp configuration bits | -+---------+-----------------+---------------+--------+----------------+------------------------+ - - -.. .. 
_PMPADDR[0-15]::: -PMPADDR[0-15] -~~~~~~~~~~~~~ ++---------+----------------+---------------+--------+----------------------+------------------------+ +| Bits | Field Name | Reset Value | Type | Legal Values | Description | ++=========+================+===============+========+======================+========================+ +| [7:0] | PMP[I*4 +0]CFG | 0x0 | WARL | masked: & 0x8f | 0x0 | pmp configuration bits | ++---------+----------------+---------------+--------+----------------------+------------------------+ +| [15:8] | PMP[I*4 +1]CFG | 0x0 | WARL | masked: & 0x8f | 0x0 | pmp configuration bits | ++---------+----------------+---------------+--------+----------------------+------------------------+ +| [23:16] | PMP[I*4 +2]CFG | 0x0 | WARL | masked: & 0x8f | 0x0 | pmp configuration bits | ++---------+----------------+---------------+--------+----------------------+------------------------+ +| [31:24] | PMP[I*4 +3]CFG | 0x0 | WARL | masked: & 0x8f | 0x0 | pmp configuration bits | ++---------+----------------+---------------+--------+----------------------+------------------------+ + + +.. .. 
_PMPCFG[2-15]::: +PMPCFG[2-15] +~~~~~~~~~~~~ + +:Address: 0x3a2-0x3af +:Reset Value: 0x00000000 +:Privilege: MRW +:Description: PMP configuration register + ++---------+----------------+---------------+--------+----------------+------------------------+ +| Bits | Field Name | Reset Value | Type | Legal Values | Description | ++=========+================+===============+========+================+========================+ +| [7:0] | PMP[I*4 +0]CFG | 0x0 | ROCST | 0x0 | pmp configuration bits | ++---------+----------------+---------------+--------+----------------+------------------------+ +| [15:8] | PMP[I*4 +1]CFG | 0x0 | ROCST | 0x0 | pmp configuration bits | ++---------+----------------+---------------+--------+----------------+------------------------+ +| [23:16] | PMP[I*4 +2]CFG | 0x0 | ROCST | 0x0 | pmp configuration bits | ++---------+----------------+---------------+--------+----------------+------------------------+ +| [31:24] | PMP[I*4 +3]CFG | 0x0 | ROCST | 0x0 | pmp configuration bits | ++---------+----------------+---------------+--------+----------------+------------------------+ -:Address: 0x3b0-0x3ef + +.. .. _PMPADDR[0-7]::: +PMPADDR[0-7] +~~~~~~~~~~~~ + +:Address: 0x3b0-0x3b7 :Reset Value: 0x00000000 :Privilege: MRW :Description: Physical memory protection address register @@ -440,6 +466,22 @@ PMPADDR[0-15] +--------+--------------+---------------+--------+-------------------------+---------------------------------------------+ +.. .. 
_PMPADDR[8-63]::: +PMPADDR[8-63] +~~~~~~~~~~~~~ + +:Address: 0x3b8-0x3ef +:Reset Value: 0x00000000 +:Privilege: MRW +:Description: Physical memory protection address register + ++--------+--------------+---------------+--------+----------------+---------------------------------------------+ +| Bits | Field Name | Reset Value | Type | Legal Values | Description | ++========+==============+===============+========+================+=============================================+ +| [31:0] | PMPADDR[I] | 0x00000000 | ROCST | 0x0 | Physical memory protection address register | ++--------+--------------+---------------+--------+----------------+---------------------------------------------+ + + .. .. _ICACHE::: ICACHE ~~~~~~ @@ -523,7 +565,7 @@ MHPMCOUNTER[3-31] +--------+----------------+---------------+--------+----------------+---------------------------------------------------------------------------+ | Bits | Field Name | Reset Value | Type | Legal Values | Description | +========+================+===============+========+================+===========================================================================+ -| [31:0] | MHPMCOUNTER[I] | 0x00000000 | ROCST | 0x00000000 | The mhpmcounter is a 64-bit counter. Returns lower 32 bits in RV32I mode. | +| [31:0] | MHPMCOUNTER[I] | 0x00000000 | ROCST | 0x0 | The mhpmcounter is a 64-bit counter. Returns lower 32 bits in RV32I mode. 
| +--------+----------------+---------------+--------+----------------+---------------------------------------------------------------------------+ @@ -572,7 +614,7 @@ MHPMCOUNTER[3-31]H +--------+-----------------+---------------+--------+----------------+----------------------------------------------------------------+ | Bits | Field Name | Reset Value | Type | Legal Values | Description | +========+=================+===============+========+================+================================================================+ -| [31:0] | MHPMCOUNTER[I]H | 0x00000000 | ROCST | 0x00000000 | The mhpmcounterh returns the upper half word in RV32I systems. | +| [31:0] | MHPMCOUNTER[I]H | 0x00000000 | ROCST | 0x0 | The mhpmcounterh returns the upper half word in RV32I systems. | +--------+-----------------+---------------+--------+----------------+----------------------------------------------------------------+ @@ -589,7 +631,7 @@ MVENDORID +--------+--------------+---------------+--------+----------------+--------------------------------------------------------------------------------------------+ | Bits | Field Name | Reset Value | Type | Legal Values | Description | +========+==============+===============+========+================+============================================================================================+ -| [31:0] | MVENDORID | 0x00000602 | ROCST | 0x00000602 | 32-bit read-only register providing the JEDEC manufacturer ID of the provider of the core. | +| [31:0] | MVENDORID | 0x00000602 | ROCST | 0x602 | 32-bit read-only register providing the JEDEC manufacturer ID of the provider of the core. 
| +--------+--------------+---------------+--------+----------------+--------------------------------------------------------------------------------------------+ @@ -606,7 +648,7 @@ MARCHID +--------+--------------+---------------+--------+----------------+-------------------------------------------------------------------------------+ | Bits | Field Name | Reset Value | Type | Legal Values | Description | +========+==============+===============+========+================+===============================================================================+ -| [31:0] | MARCHID | 0x00000003 | ROCST | 0x00000003 | MXLEN-bit read-only register encoding the base microarchitecture of the hart. | +| [31:0] | MARCHID | 0x00000003 | ROCST | 0x3 | MXLEN-bit read-only register encoding the base microarchitecture of the hart. | +--------+--------------+---------------+--------+----------------+-------------------------------------------------------------------------------+ @@ -623,7 +665,7 @@ MIMPID +--------+--------------+---------------+--------+----------------+----------------------------------------------------------------------------+ | Bits | Field Name | Reset Value | Type | Legal Values | Description | +========+==============+===============+========+================+============================================================================+ -| [31:0] | MIMPID | 0x00000000 | ROCST | 0x00000000 | Provides a unique encoding of the version of the processor implementation. | +| [31:0] | MIMPID | 0x00000000 | ROCST | 0x0 | Provides a unique encoding of the version of the processor implementation. 
| +--------+--------------+---------------+--------+----------------+----------------------------------------------------------------------------+ @@ -640,7 +682,7 @@ MHARTID +--------+--------------+---------------+--------+----------------+-------------------------------------------------------------------------------------------------+ | Bits | Field Name | Reset Value | Type | Legal Values | Description | +========+==============+===============+========+================+=================================================================================================+ -| [31:0] | MHARTID | 0x00000000 | ROCST | 0x00000000 | MXLEN-bit read-only register containing the integer ID of the hardware thread running the code. | +| [31:0] | MHARTID | 0x00000000 | ROCST | 0x0 | MXLEN-bit read-only register containing the integer ID of the hardware thread running the code. | +--------+--------------+---------------+--------+----------------+-------------------------------------------------------------------------------------------------+ @@ -657,6 +699,6 @@ MCONFIGPTR +--------+--------------+---------------+--------+----------------+-------------------------------------------------------------------------------------------------+ | Bits | Field Name | Reset Value | Type | Legal Values | Description | +========+==============+===============+========+================+=================================================================================================+ -| [31:0] | MCONFIGPTR | 0x00000000 | ROCST | 0x00000000 | MXLEN-bit read-only register that holds the physical address of a configuration data structure. | +| [31:0] | MCONFIGPTR | 0x00000000 | ROCST | 0x0 | MXLEN-bit read-only register that holds the physical address of a configuration data structure. 
| +--------+--------------+---------------+--------+----------------+-------------------------------------------------------------------------------------------------+ diff --git a/config/gen_from_riscv_config/cv32a65x/isa/isa.adoc b/config/gen_from_riscv_config/cv32a65x/isa/isa.adoc new file mode 100644 index 0000000000..c187bcbbb6 --- /dev/null +++ b/config/gen_from_riscv_config/cv32a65x/isa/isa.adoc @@ -0,0 +1,223 @@ +//// + Copyright (c) 2024 OpenHW Group + Copyright (c) 2024 Thales + SPDX-License-Identifier: Apache-2.0 WITH SHL-2.1 + Author: Abdessamii Oukalrazqou +//// + +=== isa + +==== Instructions + +|=== +|Subset Name | Name | Description + +|I | RV32I Base Integer Instructions | the base integer instruction set, also known as the 'RV32I' or 'RV64I' instruction set , depending on the address space size, provides the core functionality required for general-purpose computing .it includes instructions for arithmetic, logical, and control operations, as well as memory accessand manipulation +|M | RV32M Multiplication and Division Instructions | the standard integer multiplication and division instruction extension, which is named “M” and contains instructions that multiply or divide values held in two integer registers. +|C | RV32C Compressed Instructions | RVC uses a simple compression scheme that offers shorter 16-bit versions of common 32-bit RISC-V instructions when: the immediate or address offset is small; one of the registers is the zero register (x0), the ABI link register (x1), or the ABI stack pointer (x2); the destination register and the first source register are identical; the registers used are the 8 most popular ones.The C extension is compatible with all other standard instruction extensions. The C extension allows 16-bit instructions to be freely intermixed with 32-bit instructions, with the latter now able to start on any 16-bit boundary. 
With the addition of the C extension, JAL and JALR instructions will no longer raise an instruction misaligned exception +|Zicsr | RV32Zicsr Control and Status Register Instructions | All CSR instructions atomically read-modify-write a single CSR, whose CSR specifier is encoded in the 12-bit csr field of the instruction held in bits 31–20. The immediate forms use a 5-bit zero-extended immediate encoded in the rs1 field. +|Zcb | RV32Zcb Code Size Reduction Instructions | Zcb belongs to the group of extensions called RISC-V Code Size Reduction Extension (Zc*). Zc* has become the superset of the Standard C extension adding more 16-bit instructions to the ISA. Zcb includes the 16-bit version of additional Integer (I), Multiply (M), and Bit-Manipulation (Zbb) Instructions. All the Zcb instructions require at least standard C extension support as a prerequisite, along with M and Zbb extensions for the 16-bit version of the respective instructions. +|Zba | RVZba Address generation instructions | The Zba instructions can be used to accelerate the generation of addresses that index into arrays of basic types (halfword, word, doubleword) using both unsigned word-sized and XLEN-sized indices: a shifted index is added to a base address. The shift and add instructions do a left shift of 1, 2, or 3 because these are commonly found in real-world code and because they can be implemented with a minimal amount of additional hardware beyond that of the simple adder. This avoids lengthening the critical path in implementations. While the shift and add instructions are limited to a maximum left shift of 3, the slli instruction (from the base ISA) can be used to perform similar shifts for indexing into arrays of wider elements. The slli.uw added in this extension can be used when the index is to be interpreted as an unsigned word. 
+|Zbb | RVZbb Basic bit-manipulation | The bit-manipulation (bitmanip) extension collection is comprised of several component extensions to the base RISC-V architecture that are intended to provide some combination of code size reduction, performance improvement, and energy reduction. While the instructions are intended to have general use, some instructions are more useful in some domains than others. Hence, several smaller bitmanip extensions are provided. Each of these smaller extensions is grouped by common function and use case, and each has its own Zb*-extension name. +|Zbc | RVZbc Carry-less multiplication | Carry-less multiplication is the multiplication in the polynomial ring over GF(2).clmul produces the lower half of the carry-less product and clmulh produces the upper half of the 2✕XLEN carry-less product.clmulr produces bits 2✕XLEN−2:XLEN-1 of the 2✕XLEN carry-less product. +|Zbs | RVZbs Single bit Instructions | The single-bit instructions provide a mechanism to set, clear, invert, or extract a single bit in a register. The bit is specified by its index. +|Zicntr | Zicntr | No info found yet for extension Zicntr +|=== + +==== RV32I Base Integer Instructions + +|=== +| Name | Format | Pseudocode|Invalid_values | Exception_raised | Description| Op Name + +| ADDI | addi rd, rs1, imm[11:0] | x[rd] = x[rs1] + sext(imm[11:0]) | NONE | NONE | add sign-extended 12-bit immediate to register rs1, and store the result in register rd. | Integer_Register_Immediate_Operations +| ANDI | andi rd, rs1, imm[11:0] | x[rd] = x[rs1] & sext(imm[11:0]) | NONE | NONE | perform bitwise AND on register rs1 and the sign-extended 12-bit immediate and place the result in rd. | Integer_Register_Immediate_Operations +| ORI | ori rd, rs1, imm[11:0] | x[rd] = x[rs1] \| sext(imm[11:0]) | NONE | NONE | perform bitwise OR on register rs1 and the sign-extended 12-bit immediate and place the result in rd. 
| Integer_Register_Immediate_Operations +| XORI | xori rd, rs1, imm[11:0] | x[rd] = x[rs1] ^ sext(imm[11:0]) | NONE | NONE | perform bitwise XOR on register rs1 and the sign-extended 12-bit immediate and place the result in rd. | Integer_Register_Immediate_Operations +| SLTI | slti rd, rs1, imm[11:0] | if (x[rs1] < sext(imm[11:0])) x[rd] = 1 else x[rd] = 0 | NONE | NONE | set register rd to 1 if register rs1 is less than the sign extended immediate when both are treated as signed numbers, else 0 is written to rd. | Integer_Register_Immediate_Operations +| SLTIU | sltiu rd, rs1, imm[11:0] | if (x[rs1] <u sext(imm[11:0])) x[rd] = 1 else x[rd] = 0 | NONE | NONE | set register rd to 1 if register rs1 is less than the sign extended immediate when both are treated as unsigned numbers, else 0 is written to rd. | Integer_Register_Immediate_Operations +| SLLI | slli rd, rs1, imm[4:0] | x[rd] = x[rs1] << imm[4:0] | NONE | NONE | logical left shift (zeros are shifted into the lower bits). | Integer_Register_Immediate_Operations +| SRLI | srli rd, rs1, imm[4:0] | x[rd] = x[rs1] >> imm[4:0] | NONE | NONE | logical right shift (zeros are shifted into the upper bits). | Integer_Register_Immediate_Operations +| SRAI | srai rd, rs1, imm[4:0] | x[rd] = x[rs1] >>s imm[4:0] | NONE | NONE | arithmetic right shift (the original sign bit is copied into the vacated upper bits). | Integer_Register_Immediate_Operations +| LUI | lui rd, imm[19:0] | x[rd] = sext(imm[31:12] << 12) | NONE | NONE | place the immediate value in the top 20 bits of the destination register rd, filling in the lowest 12 bits with zeros. | Integer_Register_Immediate_Operations +| AUIPC | auipc rd, imm[19:0] | x[rd] = pc + sext(immediate[31:12] << 12) | NONE | NONE | form a 32-bit offset from the 20-bit immediate, filling in the lowest 12 bits with zeros, adds this offset to the pc, then place the result in register rd. | Integer_Register_Immediate_Operations +| ADD | add rd, rs1, rs2 | x[rd] = x[rs1] + x[rs2] | NONE | NONE | add rs2 to register rs1, and store the result in register rd. | Integer_Register_Register_Operations +| SUB | sub rd, rs1, rs2 | x[rd] = x[rs1] - x[rs2] | NONE | NONE | subtract rs2 from register rs1, and store the result in register rd. | Integer_Register_Register_Operations +| AND | and rd, rs1, rs2 | x[rd] = x[rs1] & x[rs2] | NONE | NONE | perform bitwise AND on register rs1 and rs2 and place the result in rd.
| Integer_Register_Register_Operations +| OR | or rd, rs1, rs2 | x[rd] = x[rs1] \| x[rs2] | NONE | NONE | perform bitwise OR on register rs1 and rs2 and place the result in rd. | Integer_Register_Register_Operations +| XOR | xor rd, rs1, rs2 | x[rd] = x[rs1] ^ x[rs2] | NONE | NONE | perform bitwise XOR on register rs1 and rs2 and place the result in rd. | Integer_Register_Register_Operations +| SLT | slt rd, rs1, rs2 | if (x[rs1] < x[rs2]) x[rd] = 1 else x[rd] = 0 | NONE | NONE | set register rd to 1 if register rs1 is less than rs2 when both are treated as signed numbers, else 0 is written to rd. | Integer_Register_Register_Operations +| SLTU | sltu rd, rs1, rs2 | if (x[rs1] <u x[rs2]) x[rd] = 1 else x[rd] = 0 | NONE | NONE | set register rd to 1 if register rs1 is less than rs2 when both are treated as unsigned numbers, else 0 is written to rd. | Integer_Register_Register_Operations +| SLL | sll rd, rs1, rs2 | x[rd] = x[rs1] << x[rs2] | NONE | NONE | logical left shift (zeros are shifted into the lower bits). | Integer_Register_Register_Operations +| SRL | srl rd, rs1, rs2 | x[rd] = x[rs1] >> x[rs2] | NONE | NONE | logical right shift (zeros are shifted into the upper bits). | Integer_Register_Register_Operations +| SRA | sra rd, rs1, rs2 | x[rd] = x[rs1] >>s x[rs2] | NONE | NONE | arithmetic right shift (the original sign bit is copied into the vacated upper bits). | Integer_Register_Register_Operations +| JAL | jal rd, imm[20:1] | x[rd] = pc+4; pc += sext(imm[20:1]) | NONE | jumps to an unaligned address (4-byte or 2-byte boundary) will usually raise an exception. | offset is sign-extended and added to the pc to form the jump target address (pc is calculated using signed arithmetic), then setting the least-significant bit of the result to zero, and store the address of instruction following the jump (pc+4) into register rd. | Control_Transfer_Operations-Unconditional_Jumps +| JALR | jalr rd, rs1, imm[11:0] | t = pc+4; pc = (x[rs1]+sext(imm[11:0]))&∼1 ; x[rd] = t | NONE | jumps to an unaligned address (4-byte or 2-byte boundary) will usually raise an exception. | target address is obtained by adding the 12-bit signed immediate to the register rs1 (pc is calculated using signed arithmetic), then setting the least-significant bit of the result to zero, and store the address of instruction following the jump (pc+4) into register rd.
| Control_Transfer_Operations-Unconditional_Jumps +| BEQ | beq rs1, rs2, imm[12:1] | if (x[rs1] == x[rs2]) pc += sext({imm[12:1], 1’b0}) else pc += 4 | NONE | no instruction fetch misaligned exception is generated for a conditional branch that is not taken. An Instruction address misaligned exception is raised if the target address is not aligned on 4-byte or 2-byte boundary, because the core supports compressed instructions. | takes the branch (pc is calculated using signed arithmetic) if registers rs1 and rs2 are equal. | Control_Transfer_Operations-Conditional_Branches +| BNE | bne rs1, rs2, imm[12:1] | if (x[rs1] != x[rs2]) pc += sext({imm[12:1], 1’b0}) else pc += 4 | NONE | no instruction fetch misaligned exception is generated for a conditional branch that is not taken. An Instruction address misaligned exception is raised if the target address is not aligned on 4-byte or 2-byte boundary, because the core supports compressed instructions. | takes the branch (pc is calculated using signed arithmetic) if registers rs1 and rs2 are not equal. | Control_Transfer_Operations-Conditional_Branches +| BLT | blt rs1, rs2, imm[12:1] | if (x[rs1] < x[rs2]) pc += sext({imm[12:1], 1’b0}) else pc += 4 | NONE | no instruction fetch misaligned exception is generated for a conditional branch that is not taken. An Instruction address misaligned exception is raised if the target address is not aligned on 4-byte or 2-byte boundary, because the core supports compressed instructions. | takes the branch (pc is calculated using signed arithmetic) if registers rs1 less than rs2 (using signed comparison). | Control_Transfer_Operations-Conditional_Branches +| BLTU | bltu rs1, rs2, imm[12:1] | if (x[rs1] <u x[rs2]) pc += sext({imm[12:1], 1’b0}) else pc += 4 | NONE | no instruction fetch misaligned exception is generated for a conditional branch that is not taken. An Instruction address misaligned exception is raised if the target address is not aligned on 4-byte or 2-byte boundary, because the core supports compressed instructions. | takes the branch (pc is calculated using signed arithmetic) if registers rs1 less than rs2 (using unsigned comparison). | Control_Transfer_Operations-Conditional_Branches +| BGE | bge rs1, rs2, imm[12:1] | if (x[rs1] >= x[rs2]) pc += sext({imm[12:1], 1’b0}) else pc += 4 | NONE | no instruction fetch misaligned exception is generated for a conditional branch that is not taken.
An Instruction address misaligned exception is raised if the target address is not aligned on 4-byte or 2-byte boundary, because the core supports compressed instructions. | takes the branch (pc is calculated using signed arithmetic) if registers rs1 is greater than or equal rs2 (using signed comparison). | Control_Transfer_Operations-Conditional_Branches +| BGEU | bgeu rs1, rs2, imm[12:1] | if (x[rs1] >=u x[rs2]) pc += sext({imm[12:1], 1’b0}) else pc += 4 | NONE | no instruction fetch misaligned exception is generated for a conditional branch that is not taken. An Instruction address misaligned exception is raised if the target address is not aligned on 4-byte or 2-byte boundary, because the core supports compressed instructions. | takes the branch (pc is calculated using signed arithmetic) if registers rs1 is greater than or equal rs2 (using unsigned comparison). | Control_Transfer_Operations-Conditional_Branches +| LB | lb rd, imm(rs1) | x[rd] = sext(M[x[rs1] + sext(imm[11:0])][7:0]) | NONE | loads with a destination of x0 must still raise any exceptions and action any other side effects even though the load value is discarded. | loads a 8-bit value from memory, then sign-extends to 32-bit before storing in rd (rd is calculated using signed arithmetic), the effective address is obtained by adding register rs1 to the sign-extended 12-bit offset. | Load_and_Store_Instructions +| LH | lh rd, imm(rs1) | x[rd] = sext(M[x[rs1] + sext(imm[11:0])][15:0]) | NONE | loads with a destination of x0 must still raise any exceptions and action any other side effects even though the load value is discarded, also an exception is raised if the memory address isn't aligned (2-byte boundary). | loads a 16-bit value from memory, then sign-extends to 32-bit before storing in rd (rd is calculated using signed arithmetic), the effective address is obtained by adding register rs1 to the sign-extended 12-bit offset. 
| Load_and_Store_Instructions +| LW | lw rd, imm(rs1) | x[rd] = sext(M[x[rs1] + sext(imm[11:0])][31:0]) | NONE | loads with a destination of x0 must still raise any exceptions and action any other side effects even though the load value is discarded, also an exception is raised if the memory address isn't aligned (4-byte boundary). | loads a 32-bit value from memory, then storing in rd (rd is calculated using signed arithmetic). The effective address is obtained by adding register rs1 to the sign-extended 12-bit offset. | Load_and_Store_Instructions +| LBU | lbu rd, imm(rs1) | x[rd] = zext(M[x[rs1] + sext(imm[11:0])][7:0]) | NONE | loads with a destination of x0 must still raise any exceptions and action any other side effects even though the load value is discarded. | loads a 8-bit value from memory, then zero-extends to 32-bit before storing in rd (rd is calculated using unsigned arithmetic), the effective address is obtained by adding register rs1 to the sign-extended 12-bit offset. | Load_and_Store_Instructions +| LHU | lhu rd, imm(rs1) | x[rd] = zext(M[x[rs1] + sext(imm[11:0])][15:0]) | NONE | loads with a destination of x0 must still raise any exceptions and action any other side effects even though the load value is discarded, also an exception is raised if the memory address isn't aligned (2-byte boundary). | loads a 16-bit value from memory, then zero-extends to 32-bit before storing in rd (rd is calculated using unsigned arithmetic), the effective address is obtained by adding register rs1 to the sign-extended 12-bit offset. | Load_and_Store_Instructions +| SB | sb rs2, imm(rs1) | M[x[rs1] + sext(imm[11:0])][7:0] = x[rs2][7:0] | NONE | NONE | stores a 8-bit value from the low bits of register rs2 to memory, the effective address is obtained by adding register rs1 to the sign-extended 12-bit offset. 
| Load_and_Store_Instructions +| SH | sh rs2, imm(rs1) | M[x[rs1] + sext(imm[11:0])][15:0] = x[rs2][15:0] | NONE | an exception is raised if the memory address isn't aligned (2-byte boundary). | stores a 16-bit value from the low bits of register rs2 to memory, the effective address is obtained by adding register rs1 to the sign-extended 12-bit offset. | Load_and_Store_Instructions +| SW | sw rs2, imm(rs1) | M[x[rs1] + sext(imm[11:0])][31:0] = x[rs2][31:0] | NONE | an exception is raised if the memory address isn't aligned (4-byte boundary). | stores a 32-bit value from register rs2 to memory, the effective address is obtained by adding register rs1 to the sign-extended 12-bit offset. | Load_and_Store_Instructions +| FENCE | fence pre, succ | No operation (nop) | NONE | NONE | order device I/O and memory accesses as viewed by other RISC-V harts and external devices or coprocessors. Any combination of device input (I), device output (O), memory reads (R), and memory writes (W) may be ordered with respect to any combination of the same. Informally, no other RISC-V hart or external device can observe any operation in the successor set following a FENCE before any operation in the predecessor set preceding the FENCE. As the core supports 1 hart, the fence instruction has no effect, so it can be considered a nop instruction. | Memory_Ordering +| ECALL | ecall | RaiseException(EnvironmentCall) | NONE | Raise an Environment Call exception. | make a request to the supporting execution environment, which is usually an operating system. The ABI for the system will define how parameters for the environment request are passed, but usually these will be in defined locations in the integer register file. | Environment_Call_and_Breakpoints +| EBREAK | ebreak | RaiseException(Breakpoint) | NONE | Raise a Breakpoint exception. |
cause control to be transferred back to the debugging environment. | Environment_Call_and_Breakpoints +|=== + +==== RV32M Multiplication and Division Instructions + +|=== +| Name | Format | Pseudocode|Invalid_values | Exception_raised | Description| Op Name + +| MUL | mul rd, rs1, rs2 | x[rd] = x[rs1] * x[rs2] | NONE | NONE | performs a 32-bit × 32-bit multiplication and places the lower 32 bits in the destination register (Both rs1 and rs2 treated as signed numbers). | Multiplication Operations +| MULH | mulh rd, rs1, rs2 | x[rd] = (x[rs1] s*s x[rs2]) >>s 32 | NONE | NONE | performs a 32-bit × 32-bit multiplication and places the upper 32 bits in the destination register of the 64-bit product (Both rs1 and rs2 treated as signed numbers). | Multiplication Operations +| MULHU | mulhu rd, rs1, rs2 | x[rd] = (x[rs1] u*u x[rs2]) >>u 32 | NONE | NONE | performs a 32-bit × 32-bit multiplication and places the upper 32 bits in the destination register of the 64-bit product (Both rs1 and rs2 treated as unsigned numbers). | Multiplication Operations +| MULHSU | mulhsu rd, rs1, rs2 | x[rd] = (x[rs1] s*u x[rs2]) >>s 32 | NONE | NONE | performs a 32-bit × 32-bit multiplication and places the upper 32 bits in the destination register of the 64-bit product (rs1 treated as signed number, rs2 treated as unsigned number). | Multiplication Operations +| DIV | div rd, rs1, rs2 | x[rd] = x[rs1] /s x[rs2] | NONE | NONE | perform signed integer division of 32 bits by 32 bits (rounding towards zero). | Division Operations +| DIVU | divu rd, rs1, rs2 | x[rd] = x[rs1] /u x[rs2] | NONE | NONE | perform unsigned integer division of 32 bits by 32 bits (rounding towards zero).
| Division Operations +| REM | rem rd, rs1, rs2 | x[rd] = x[rs1] %s x[rs2] | NONE | NONE | provide the remainder of the corresponding division operation DIV (the sign of rd equals the sign of rs1). | Division Operations +| REMU | remu rd, rs1, rs2 | x[rd] = x[rs1] %u x[rs2] | NONE | NONE | provide the remainder of the corresponding division operation DIVU. | Division Operations +|=== + +==== RV32C Compressed Instructions + +|=== +| Name | Format | Pseudocode|Invalid_values | Exception_raised | Description| Op Name + +| C.LI | c.li rd, imm[5:0] | x[rd] = sext(imm[5:0]) | rd = x0 | NONE | loads the sign-extended 6-bit immediate, imm, into register rd. | Integer Computational Instructions +| C.LUI | c.lui rd, nzimm[17:12] | x[rd] = sext(nzimm[17:12] << 12) | rd = x0 & rd = x2 & nzimm = 0 | NONE | loads the non-zero 6-bit immediate field into bits 17–12 of the destination register, clears the bottom 12 bits, and sign-extends bit 17 into all higher bits of the destination. | Integer Computational Instructions +| C.ADDI | c.addi rd, nzimm[5:0] | x[rd] = x[rd] + sext(nzimm[5:0]) | rd = x0 & nzimm = 0 | NONE | adds the non-zero sign-extended 6-bit immediate to the value in register rd then writes the result to rd. | Integer Computational Instructions +| C.ADDI16SP | c.addi16sp nzimm[9:4] | x[2] = x[2] + sext(nzimm[9:4]) | rd != x2 & nzimm = 0 | NONE | adds the non-zero sign-extended 6-bit immediate to the value in the stack pointer (sp=x2), where the immediate is scaled to represent multiples of 16 in the range (-512,496). C.ADDI16SP is used to adjust the stack pointer in procedure prologues and epilogues. C.ADDI16SP shares the opcode with C.LUI, but has a destination field of x2. | Integer Computational Instructions +| C.ADDI4SPN | c.addi4spn rd', nzimm[9:2] | x[8 + rd'] = x[2] + zext(nzimm[9:2]) | nzimm = 0 | NONE | adds a zero-extended non-zero immediate, scaled by 4, to the stack pointer, x2, and writes the result to rd'.
This instruction is used to generate pointers to stack-allocated variables. | Integer Computational Instructions +| C.SLLI | c.slli rd, uimm[5:0] | x[rd] = x[rd] << uimm[5:0] | rd = x0 & uimm[5] = 0 | NONE | performs a logical left shift (zeros are shifted into the lower bits). | Integer Computational Instructions +| C.SRLI | c.srli rd', uimm[5:0] | x[8 + rd'] = x[8 + rd'] >> uimm[5:0] | uimm[5] = 0 | NONE | performs a logical right shift (zeros are shifted into the upper bits). | Integer Computational Instructions +| C.SRAI | c.srai rd', uimm[5:0] | x[8 + rd'] = x[8 + rd'] >>s uimm[5:0] | uimm[5] = 0 | NONE | performs an arithmetic right shift (sign bits are shifted into the upper bits). | Integer Computational Instructions +| C.ANDI | c.andi rd', imm[5:0] | x[8 + rd'] = x[8 + rd'] & sext(imm[5:0]) | NONE | NONE | computes the bitwise AND of the value in register rd', and the sign-extended 6-bit immediate, then writes the result to rd'. | Integer Computational Instructions +| C.ADD | c.add rd, rs2 | x[rd] = x[rd] + x[rs2] | rd = x0 & rs2 = x0 | NONE | adds the values in registers rd and rs2 and writes the result to register rd. | Integer Computational Instructions +| C.MV | c.mv rd, rs2 | x[rd] = x[rs2] | rd = x0 & rs2 = x0 | NONE | copies the value in register rs2 into register rd. | Integer Computational Instructions +| C.AND | c.and rd', rs2' | x[8 + rd'] = x[8 + rd'] & x[8 + rs2'] | NONE | NONE | computes the bitwise AND of the value in register rd', and register rs2', then writes the result to rd'. | Integer Computational Instructions +| C.OR | c.or rd', rs2' | x[8 + rd'] = x[8 + rd'] \| x[8 + rs2'] | NONE | NONE | computes the bitwise OR of the value in register rd', and register rs2', then writes the result to rd'. | Integer Computational Instructions +| C.XOR | c.xor rd', rs2' | x[8 + rd'] = x[8 + rd'] ^ x[8 + rs2'] | NONE | NONE | computes the bitwise XOR of the value in register rd', and register rs2', then writes the result to rd'.
| Integer Computational Instructions +| C.SUB | c.sub rd', rs2' | x[8 + rd'] = x[8 + rd'] - x[8 + rs2'] | NONE | NONE | subtracts the value in registers rs2' from value in rd' and writes the result to register rd'. | Integer Computational Instructions +| C.EBREAK | c.ebreak | RaiseException(Breakpoint) | NONE | Raise a Breakpoint exception. | cause control to be transferred back to the debugging environment. | Integer Computational Instructions +| C.J | c.j imm[11:1] | pc += sext(imm[11:1]) | NONE | jumps to an unaligned address (4-byte or 2-byte boundary) will usually raise an exception. | performs an unconditional control transfer. The offset is sign-extended and added to the pc to form the jump target address. | Control Transfer Instructions +| C.JAL | c.jal imm[11:1] | x[1] = pc+2; pc += sext(imm[11:1]) | NONE | jumps to an unaligned address (4-byte or 2-byte boundary) will usually raise an exception. | performs the same operation as C.J, but additionally writes the address of the instruction following the jump (pc+2) to the link register, x1. | Control Transfer Instructions +| C.JR | c.jr rs1 | pc = x[rs1] | rs1 = x0 | jumps to an unaligned address (4-byte or 2-byte boundary) will usually raise an exception. | performs an unconditional control transfer to the address in register rs1. | Control Transfer Instructions +| C.JALR | c.jalr rs1 | t = pc+2; pc = x[rs1]; x[1] = t | rs1 = x0 | jumps to an unaligned address (4-byte or 2-byte boundary) will usually raise an exception. | performs the same operation as C.JR, but additionally writes the address of the instruction following the jump (pc+2) to the link register, x1. | Control Transfer Instructions +| C.BEQZ | c.beqz rs1', imm[8:1] | if (x[8+rs1'] == 0) pc += sext(imm[8:1]) | NONE | no instruction fetch misaligned exception is generated for a conditional branch that is not taken. 
An Instruction address misaligned exception is raised if the target address is not aligned on 4-byte or 2-byte boundary, because the core supports compressed instructions. | performs conditional control transfers. The offset is sign-extended and added to the pc to form the branch target address. C.BEQZ takes the branch if the value in register rs1' is zero. | Control Transfer Instructions +| C.BNEZ | c.bnez rs1', imm[8:1] | if (x[8+rs1'] != 0) pc += sext(imm[8:1]) | NONE | no instruction fetch misaligned exception is generated for a conditional branch that is not taken. An Instruction address misaligned exception is raised if the target address is not aligned on 4-byte or 2-byte boundary, because the core supports compressed instructions. | performs conditional control transfers. The offset is sign-extended and added to the pc to form the branch target address. C.BNEZ takes the branch if the value in register rs1' isn't zero. | Control Transfer Instructions +| C.LWSP | c.lwsp rd, uimm(x2) | x[rd] = M[x[2] + zext(uimm[7:2])][31:0] | rd = x0 | loads with a destination of x0 must still raise any exceptions, also an exception if the memory address isn't aligned (4-byte boundary). | loads a 32-bit value from memory into register rd. It computes an effective address by adding the zero-extended offset, scaled by 4, to the stack pointer, x2. | Load and Store Instructions +| C.SWSP | c.swsp rs2, uimm(x2) | M[x[2] + zext(uimm[7:2])][31:0] = x[rs2] | NONE | an exception is raised if the memory address isn't aligned (4-byte boundary). | stores a 32-bit value in register rs2 to memory. It computes an effective address by adding the zero-extended offset, scaled by 4, to the stack pointer, x2. | Load and Store Instructions +| C.LW | c.lw rd', uimm(rs1') | x[8+rd'] = M[x[8+rs1'] + zext(uimm[6:2])][31:0] | NONE | an exception is raised if the memory address isn't aligned (4-byte boundary). | loads a 32-bit value from memory into register rd'.
It computes an effective address by adding the zero-extended offset, scaled by 4, to the base address in register rs1'. | Load and Store Instructions +| C.SW | c.sw rs2', uimm(rs1') | M[x[8+rs1'] + zext(uimm[6:2])][31:0] = x[8+rs2'] | NONE | an exception is raised if the memory address isn't aligned (4-byte boundary). | stores a 32-bit value in register rs2' to memory. It computes an effective address by adding the zero-extended offset, scaled by 4, to the base address in register rs1'. | Load and Store Instructions +|=== + +==== RV32Zicsr Control and Status Register Instructions + +|=== +| Name | Format | Pseudocode|Invalid_values | Exception_raised | Description| Op Name + +| CSRRW | csrrw rd, csr, rs1 | t = CSRs[csr]; CSRs[csr] = x[rs1]; x[rd] = t | NONE | Attempts to access a non-existent CSR raise an illegal instruction exception. Attempts to access a CSR without appropriate privilege level or to write a read-only register also raise illegal instruction exceptions. | Reads the old value of the CSR, zero-extends the value to 32 bits, then writes it to integer register rd. The initial value in rs1 is written to the CSR. If rd=x0, then the instruction shall not read the CSR and shall not cause any of the side-effects that might occur on a CSR read. | Control and Status Register Operations +| CSRRS | csrrs rd, csr, rs1 | t = CSRs[csr]; CSRs[csr] = t \| x[rs1]; x[rd] = t | NONE | Attempts to access a non-existent CSR raise an illegal instruction exception. Attempts to access a CSR without appropriate privilege level or to write a read-only register also raise illegal instruction exceptions. | Reads the value of the CSR, zero-extends the value to 32 bits, and writes it to integer register rd. The initial value in integer register rs1 is treated as a bit mask that specifies bit positions to be set in the CSR. Any bit that is high in rs1 will cause the corresponding bit to be set in the CSR, if that CSR bit is writable.
Other bits in the CSR are unaffected (though CSRs might have side effects when written). If rs1=x0, then the instruction will not write to the CSR at all, and so shall not cause any of the side effects that might otherwise occur on a CSR write, such as raising illegal instruction exceptions on accesses to read-only CSRs. | Control and Status Register Operations +| CSRRC | csrrc rd, csr, rs1 | t = CSRs[csr]; CSRs[csr] = t & ∼x[rs1]; x[rd] = t | NONE | Attempts to access a non-existent CSR raise an illegal instruction exception. Attempts to access a CSR without appropriate privilege level or to write a read-only register also raise illegal instruction exceptions. | Reads the value of the CSR, zero-extends the value to 32 bits, and writes it to integer register rd. The initial value in integer register rs1 is treated as a bit mask that specifies bit positions to be cleared in the CSR. Any bit that is high in rs1 will cause the corresponding bit to be cleared in the CSR, if that CSR bit is writable. Other bits in the CSR are unaffected (though CSRs might have side effects when written). If rs1=x0, then the instruction will not write to the CSR at all, and so shall not cause any of the side effects that might otherwise occur on a CSR write, such as raising illegal instruction exceptions on accesses to read-only CSRs. | Control and Status Register Operations +| CSRRWI | csrrwi rd, csr, uimm[4:0] | x[rd] = CSRs[csr]; CSRs[csr] = zext(uimm[4:0]) | NONE | Attempts to access a non-existent CSR raise an illegal instruction exception. Attempts to access a CSR without appropriate privilege level or to write a read-only register also raise illegal instruction exceptions. | Reads the old value of the CSR, zero-extends the value to 32 bits, then writes it to integer register rd. The zero-extended immediate is written to the CSR. If rd=x0, then the instruction shall not read the CSR and shall not cause any of the side-effects that might occur on a CSR read.
| Control and Status Register Operations +| CSRRSI | csrrsi rd, csr, uimm[4:0] | t = CSRs[csr]; CSRs[csr] = t \| zext(uimm[4:0]); x[rd] = t | NONE | Attempts to access a non-existent CSR raise an illegal instruction exception. Attempts to access a CSR without appropriate privilege level or to write a read-only register also raise illegal instruction exceptions. | Reads the value of the CSR, zero-extends the value to 32 bits, and writes it to integer register rd. The zero-extended immediate value is treated as a bit mask that specifies bit positions to be set in the CSR. Any bit that is high in the zero-extended immediate will cause the corresponding bit to be set in the CSR, if that CSR bit is writable. Other bits in the CSR are unaffected (though CSRs might have side effects when written). If the uimm[4:0] field is zero, then these instructions will not write to the CSR, and shall not cause any of the side effects that might otherwise occur on a CSR write. | Control and Status Register Operations +| CSRRCI | csrrci rd, csr, uimm[4:0] | t = CSRs[csr]; CSRs[csr] = t & ∼zext(uimm[4:0]); x[rd] = t | NONE | Attempts to access a non-existent CSR raise an illegal instruction exception. Attempts to access a CSR without appropriate privilege level or to write a read-only register also raise illegal instruction exceptions. | Reads the value of the CSR, zero-extends the value to 32 bits, and writes it to integer register rd. The zero-extended immediate value is treated as a bit mask that specifies bit positions to be cleared in the CSR. Any bit that is high in the zero-extended immediate will cause the corresponding bit to be cleared in the CSR, if that CSR bit is writable. Other bits in the CSR are unaffected (though CSRs might have side effects when written). If the uimm[4:0] field is zero, then these instructions will not write to the CSR, and shall not cause any of the side effects that might otherwise occur on a CSR write.
| Control and Status Register Operations +|=== + +==== RV32Zcb Code Size Reduction Instructions + +|=== +| Name | Format | Pseudocode|Invalid_values | Exception_raised | Description| Op Name + +| C.ZEXT.B | c.zext.b rd' | x[8 + rd'] = zext(x[8 + rd'][7:0]) | NONE | NONE | This instruction takes a single source/destination operand. It zero-extends the least-significant byte of the operand by inserting zeros into all of the bits more significant than 7. | Code Size Reduction Operations +| C.SEXT.B | c.sext.b rd' | x[8 + rd'] = sext(x[8 + rd'][7:0]) | NONE | NONE | This instruction takes a single source/destination operand. It sign-extends the least-significant byte in the operand by copying the most-significant bit in the byte (i.e., bit 7) to all of the more-significant bits. It also requires Bit-Manipulation (Zbb) extension support. | Code Size Reduction Operations +| C.ZEXT.H | c.zext.h rd' | x[8 + rd'] = zext(x[8 + rd'][15:0]) | NONE | NONE | This instruction takes a single source/destination operand. It zero-extends the least-significant halfword of the operand by inserting zeros into all of the bits more significant than 15. It also requires Bit-Manipulation (Zbb) extension support. | Code Size Reduction Operations +| C.SEXT.H | c.sext.h rd' | x[8 + rd'] = sext(x[8 + rd'][15:0]) | NONE | NONE | This instruction takes a single source/destination operand. It sign-extends the least-significant halfword in the operand by copying the most-significant bit in the halfword (i.e., bit 15) to all of the more-significant bits. It also requires Bit-Manipulation (Zbb) extension support. | Code Size Reduction Operations +| C.NOT | c.not rd' | x[8 + rd'] = x[8 + rd'] ^ -1 | NONE | NONE | This instruction takes the one’s complement of rd'/rs1' and writes the result to the same register. 
| Code Size Reduction Operations +| C.MUL | c.mul rd', rs2' | x[8 + rd'] = (x[8 + rd'] * x[8 + rs2'])[31:0] | NONE | NONE | performs a 32-bit × 32-bit multiplication and places the lower 32 bits in the destination register (Both rd' and rs2' treated as signed numbers). It also requires M extension support. | Code Size Reduction Operations +| C.LHU | c.lhu rd', uimm(rs1') | x[8+rd'] = zext(M[x[8+rs1'] + zext(uimm[1])][15:0]) | NONE | an exception raised if the memory address isn't aligned (2-byte boundary). | This instruction loads a halfword from the memory address formed by adding rs1' to the zero extended immediate uimm. The resulting halfword is zero extended and is written to rd'. | Code Size Reduction Operations +| C.LH | c.lh rd', uimm(rs1') | x[8+rd'] = sext(M[x[8+rs1'] + zext(uimm[1])][15:0]) | NONE | an exception raised if the memory address isn't aligned (2-byte boundary). | This instruction loads a halfword from the memory address formed by adding rs1' to the zero extended immediate uimm. The resulting halfword is sign extended and is written to rd'. | Code Size Reduction Operations +| C.LBU | c.lbu rd', uimm(rs1') | x[8+rd'] = zext(M[x[8+rs1'] + zext(uimm[1:0])][7:0]) | NONE | NONE | This instruction loads a byte from the memory address formed by adding rs1' to the zero extended immediate uimm. The resulting byte is zero extended and is written to rd'. | Code Size Reduction Operations +| C.SH | c.sh rs2', uimm(rs1') | M[x[8+rs1'] + zext(uimm[1])][15:0] = x[8+rs2'] | NONE | an exception raised if the memory address isn't aligned (2-byte boundary). | This instruction stores the least significant halfword of rs2' to the memory address formed by adding rs1' to the zero extended immediate uimm. 
| Code Size Reduction Operations +| C.SB | c.sb rs2', uimm(rs1') | M[x[8+rs1'] + zext(uimm[1:0])][7:0] = x[8+rs2'] | NONE | NONE | This instruction stores the least significant byte of rs2' to the memory address formed by adding rs1' to the zero extended immediate uimm. | Code Size Reduction Operations +|=== + +==== RVZba Address generation instructions + +|=== +| Name | Format | Pseudocode|Invalid_values | Exception_raised | Description| Op Name + +| ADD.UW | add.uw rd, rs1, rs2 | X(rd) = rs2 + EXTZ(X(rs1)[31..0]) | NONE | NONE | This instruction performs an XLEN-wide addition between rs2 and the zero-extended least-significant word of rs1. | Address generation instructions +| SH1ADD | sh1add rd, rs1, rs2 | X(rd) = X(rs2) + (X(rs1) << 1) | NONE | NONE | This instruction shifts rs1 to the left by 1 bit and adds it to rs2. | Address generation instructions +| SH1ADD.UW | sh1add.uw rd, rs1, rs2 | X(rd) = rs2 + (EXTZ(X(rs1)[31..0]) << 1) | NONE | NONE | This instruction performs an XLEN-wide addition of two addends. The first addend is rs2. The second addend is the unsigned value formed by extracting the least-significant word of rs1 and shifting it left by 1 place. | Address generation instructions +| SH2ADD | sh2add rd, rs1, rs2 | X(rd) = X(rs2) + (X(rs1) << 2) | NONE | NONE | This instruction shifts rs1 to the left by 2 bits and adds it to rs2. | Address generation instructions +| SH2ADD.UW | sh2add.uw rd, rs1, rs2 | X(rd) = rs2 + (EXTZ(X(rs1)[31..0]) << 2) | NONE | NONE | This instruction performs an XLEN-wide addition of two addends. The first addend is rs2. The second addend is the unsigned value formed by extracting the least-significant word of rs1 and shifting it left by 2 places. | Address generation instructions +| SH3ADD | sh3add rd, rs1, rs2 | X(rd) = X(rs2) + (X(rs1) << 3) | NONE | NONE | This instruction shifts rs1 to the left by 3 bits and adds it to rs2.
| Address generation instructions +| SH3ADD.UW | sh3add.uw rd, rs1, rs2 | X(rd) = rs2 + (EXTZ(X(rs1)[31..0]) << 3) | NONE | NONE | This instruction performs an XLEN-wide addition of two addends. The first addend is rs2. The second addend is the unsigned value formed by extracting the least-significant word of rs1 and shifting it left by 3 places. | Address generation instructions +| SLLI.UW | slli.uw rd, rs1, imm | X(rd) = (EXTZ(X(rs)[31..0]) << imm) | NONE | NONE | This instruction takes the least-significant word of rs1, zero-extends it, and shifts it left by the immediate. | Address generation instructions +|=== + +==== RVZbb Basic bit-manipulation + +|=== +| Name | Format | Pseudocode|Invalid_values | Exception_raised | Description| Op Name + +| ANDN | andn rd, rs1, rs2 | X(rd) = X(rs1) & ~X(rs2) | NONE | NONE | Performs bitwise AND operation between rs1 and bitwise inversion of rs2. | Logical_with_negate +| ORN | orn rd, rs1, rs2 | X(rd) = X(rs1) \| ~X(rs2) | NONE | NONE | Performs bitwise OR operation between rs1 and bitwise inversion of rs2. | Logical_with_negate +| XNOR | xnor rd, rs1, rs2 | X(rd) = ~(X(rs1) ^ X(rs2)) | NONE | NONE | Performs bitwise XOR operation between rs1 and rs2, then complements the result. | Logical_with_negate +| CLZ | clz rd, rs | if [x[i]] == 1 then return(i) else return -1 | NONE | NONE | Counts leading zero bits in rs. | Count_leading_trailing_zero_bits +| CTZ | ctz rd, rs | if [x[i]] == 1 then return(i) else return xlen; | NONE | NONE | Counts trailing zero bits in rs. | Count_leading_trailing_zero_bits +| CLZW | clzw rd, rs | if [x[i]] == 1 then return(i) else return -1 | NONE | NONE | Counts leading zero bits in the least-significant word of rs. | Count_leading_trailing_zero_bits +| CTZW | ctzw rd, rs | if [x[i]] == 1 then return(i) else return 32; | NONE | NONE | Counts trailing zero bits in the least-significant word of rs. 
| Count_leading_trailing_zero_bits +| CPOP | cpop rd, rs | if rs[i] == 1 then bitcount = bitcount + 1 else () | NONE | NONE | Counts set bits in rs. | Count_population +| CPOPW | cpopw rd, rs | if rs[i] == 0b1 then bitcount = bitcount + 1 else () | NONE | NONE | Counts set bits in the least-significant word of rs. | Count_population +| MAX | max rd, rs1, rs2 | if rs1_val <_s rs2_val then rs2_val else rs1_val | NONE | NONE | Returns the larger of two signed integers. | Integer_minimum_maximum +| MAXU | maxu rd, rs1, rs2 | if rs1_val <_u rs2_val then rs2_val else rs1_val | NONE | NONE | Returns the larger of two unsigned integers. | Integer_minimum_maximum +| MIN | min rd, rs1, rs2 | if rs1_val <_s rs2_val then rs1_val else rs2_val | NONE | NONE | Returns the smaller of two signed integers. | Integer_minimum_maximum +| MINU | minu rd, rs1, rs2 | if rs1_val <_u rs2_val then rs1_val else rs2_val | NONE | NONE | Returns the smaller of two unsigned integers. | Integer_minimum_maximum +| SEXT.B | sext.b rd, rs | X(rd) = EXTS(X(rs)[7..0]) | NONE | NONE | Sign-extends the least-significant byte in the source to XLEN. | Sign_and_zero_extension +| SEXT.H | sext.h rd, rs | X(rd) = EXTS(X(rs)[15..0]) | NONE | NONE | Sign-extends the least-significant halfword in rs to XLEN. | Sign_and_zero_extension +| ZEXT.H | zext.h rd, rs | X(rd) = EXTZ(X(rs)[15..0]) | NONE | NONE | Zero-extends the least-significant halfword of the source to XLEN. | Sign_and_zero_extension +| ROL | rol rd, rs1, rs2 | (X(rs1) << log2(XLEN)) \| (X(rs1) >> (xlen - log2(XLEN))) | NONE | NONE | Performs a rotate left of rs1 by the amount in least-significant log2(XLEN) bits of rs2. | Bitwise_rotation +| ROR | ror rd, rs1, rs2 | (X(rs1) >> log2(XLEN)) \| (X(rs1) << (xlen - log2(XLEN))) | NONE | NONE | Performs a rotate right of rs1 by the amount in least-significant log2(XLEN) bits of rs2. 
| Bitwise_rotation +| RORI | rori rd, rs1, shamt | (X(rs1) >> log2(XLEN)) \| (X(rs1) << (xlen - log2(XLEN))) | NONE | NONE | Performs a rotate right of rs1 by the amount in least-significant log2(XLEN) bits of shamt. | Bitwise_rotation +| ROLW | rolw rd, rs1, rs2 | EXTS((rs1 << X(rs2)[4..0]) \| (rs1 >> (32 - X(rs2)[4..0]))) | NONE | NONE | Performs a rotate left on the least-significant word of rs1 by the amount in least-significant 5 bits of rs2. | Bitwise_rotation +| RORIW | roriw rd, rs1, shamt | (rs1_data >> shamt[4..0]) \| (rs1_data << (32 - shamt[4..0])) | NONE | NONE | Performs a rotate right on the least-significant word of rs1 by the amount in least-significant log2(XLEN) bits of shamt. | Bitwise_rotation +| RORW | rorw rd, rs1, rs2 | (rs1 >> X(rs2)[4..0]) \| (rs1 << (32 - X(rs2)[4..0])) | NONE | NONE | Performs a rotate right on the least-significant word of rs1 by the amount in least-significant 5 bits of rs2. | Bitwise_rotation +| ORC.b | orc.b rd, rs | if { input[(i + 7)..i] == 0 then 0b00000000 else 0b11111111 | NONE | NONE | Sets the bits of each byte in rd to all zeros if no bit within the respective byte of rs is set, or to all ones if any bit within the respective byte of rs is set. | OR_Combine +| REV8 | rev8 rd, rs | output[i..(i + 7)] = input[(j - 7)..j] | NONE | NONE | Reverses the order of the bytes in rs. | Byte_reverse +|=== + +==== RVZbc Carry-less multiplication + +|=== +| Name | Format | Pseudocode|Invalid_values | Exception_raised | Description| Op Name + +| CLMUL | clmul rd, rs1, rs2 | foreach (i from 1 to xlen by 1) { output = if ((rs2 >> i) & 1) then output ^ (rs1 << i); else output;} | NONE | NONE | clmul produces the lower half of the 2.XLEN carry-less product. | Carry-less multiplication Operations +| CLMULH | clmulh rd, rs1, rs2 | foreach (i from 1 to xlen by 1) { output = if ((rs2_val >> i) & 1) then output ^ (rs1_val >> (xlen - i)) else output} | NONE | NONE | clmulh produces the upper half of the 2.XLEN carry-less product. 
| Carry-less multiplication Operations +| CLMULR | clmulr rd, rs1, rs2 | foreach (i from 0 to (xlen - 1) by 1) { output = if ((rs2_val >> i) & 1) then output ^ (rs1_val >> (xlen - i - 1)) else output} | NONE | NONE | clmulr produces bits 2.XLEN-2:XLEN-1 of the 2.XLEN carry-less product. | Carry-less multiplication Operations +|=== + +==== RVZbs Single bit Instructions + +|=== +| Name | Format | Pseudocode|Invalid_values | Exception_raised | Description| Op Name + +| BCLR | bclr rd, rs1, rs2 | X(rd) = X(rs1) & ~(1 << (X(rs2) & (XLEN - 1))) | NONE | NONE | This instruction returns rs1 with a single bit cleared at the index specified in rs2. The index is read from the lower log2(XLEN) bits of rs2. | Single_bit_Operations +| BCLRI | bclri rd, rs1, shamt | X(rd) = X(rs1) & ~(1 << (shamt & (XLEN - 1))) | NONE | NONE | This instruction returns rs1 with a single bit cleared at the index specified in shamt. The index is read from the lower log2(XLEN) bits of shamt. For RV32, the encodings corresponding to shamt[5]=1 are reserved. | Single_bit_Operations +| BEXT | bext rd, rs1, rs2 | X(rd) = (X(rs1) >> (X(rs2) & (XLEN - 1))) & 1 | NONE | NONE | This instruction returns a single bit extracted from rs1 at the index specified in rs2. The index is read from the lower log2(XLEN) bits of rs2. | Single_bit_Operations +| BEXTI | bexti rd, rs1, shamt | X(rd) = (X(rs1) >> (shamt & (XLEN - 1))) & 1 | NONE | NONE | This instruction returns a single bit extracted from rs1 at the index specified in shamt. The index is read from the lower log2(XLEN) bits of shamt. For RV32, the encodings corresponding to shamt[5]=1 are reserved. | Single_bit_Operations +| BINV | binv rd, rs1, rs2 | X(rd) = X(rs1) ^ (1 << (X(rs2) & (XLEN - 1))) | NONE | NONE | This instruction returns rs1 with a single bit inverted at the index specified in rs2. The index is read from the lower log2(XLEN) bits of rs2.
| Single_bit_Operations +| BINVI | binvi rd, rs1, shamt | X(rd) = X(rs1) ^ (1 << (shamt & (XLEN - 1))) | NONE | NONE | This instruction returns rs1 with a single bit inverted at the index specified in shamt. The index is read from the lower log2(XLEN) bits of shamt. For RV32, the encodings corresponding to shamt[5]=1 are reserved. | Single_bit_Operations +| BSET | bset rd, rs1, rs2 | X(rd) = X(rs1) \| (1 << (X(rs2) & (XLEN - 1))) | NONE | NONE | This instruction returns rs1 with a single bit set at the index specified in rs2. The index is read from the lower log2(XLEN) bits of rs2. | Single_bit_Operations +| BSETI | bseti rd, rs1, shamt | X(rd) = X(rs1) \| (1 << (shamt & (XLEN - 1))) | NONE | NONE | This instruction returns rs1 with a single bit set at the index specified in shamt. The index is read from the lower log2(XLEN) bits of shamt. For RV32, the encodings corresponding to shamt[5]=1 are reserved. | Single_bit_Operations +|=== + diff --git a/config/gen_from_riscv_config/cv32a65x/isa/isa.rst b/config/gen_from_riscv_config/cv32a65x/isa/isa.rst index 21bf0a004d..2d02548c4e 100644 --- a/config/gen_from_riscv_config/cv32a65x/isa/isa.rst +++ b/config/gen_from_riscv_config/cv32a65x/isa/isa.rst @@ -32,9 +32,6 @@ Instructions 
+---------------+-----------------------------------------------------+----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+ | Zicsr | RV32Zicsr Control and Status Register Instructions_ | All CSR instructions atomically read-modify-write a single CSR, whose CSR specifier is encoded in the 12-bit csr field of the instruction held in bits 31–20. The immediate forms use a 5-bit zero-extended immediate encoded in the rs1 field. 
| +---------------+-----------------------------------------------------+----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+ -| Zifencei | RVZifencei Instruction Fetch Fence_ | FENCE.I instruction that provides explicit synchronization between writes to instruction memory and instruction fetches on the same hart. | -| | | Currently, this instruction is the only standard mechanism to ensure that stores visible to a hart will also be visible to it instruction fetches. 
| -+---------------+-----------------------------------------------------+----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+ | Zcb | RV32Zcb Code Size Reduction Instructions_ | Zcb belongs to the group of extensions called RISC-V Code Size Reduction Extension (Zc*). Zc* has become the superset of the Standard C extension adding more 16-bit instructions to the ISA. Zcb includes the 16-bit version of additional Integer (I), Multiply (M), and Bit-Manipulation (Zbb) Instructions. All the Zcb instructions require at least standard C extension support as a prerequisite, along with M and Zbb extensions for the 16-bit version of the respective instructions. 
| +---------------+-----------------------------------------------------+----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+ | Zba | RVZba Address generation instructions_ | The Zba instructions can be used to accelerate the generation of addresses that index into arrays of basic types (halfword, word, doubleword) using both unsigned word-sized and XLEN-sized indices: a shifted index is added to a base address. The shift and add instructions do a left shift of 1, 2, or 3 because these are commonly found in real-world code and because they can be implemented with a minimal amount of additional hardware beyond that of the simple adder. This avoids lengthening the critical path in implementations. While the shift and add instructions are limited to a maximum left shift of 3, the slli instruction (from the base ISA) can be used to perform similar shifts for indexing into arrays of wider elements. The slli.uw added in this extension can be used when the index is to be interpreted as an unsigned word. 
| @@ -243,16 +240,6 @@ RV32Zicsr Control and Status Register Instructions | CSRRCI | csrrci rd, csr, uimm[4:0] | t = CSRs[csr]; CSRs[csr] = t & ∼zext(uimm[4:0]); x[rd] = t | NONE | Attempts to access a non-existent CSR raise an illegal instruction exception. Attempts to access a CSR without appropriate privilege level or to write a read-only register also raise illegal instruction exceptions. | Reads the value of the CSR, zero-extends the value to 32 bits, and writes it to integer register rd. The zero-extends immediate value is treated as a bit mask that specifies bit positions to be cleared in the CSR. Any bit that is high in zero-extends immediate will cause the corresponding bit to be set in the CSR, if that CSR bit is writable. Other bits in the CSR are unaffected (though CSRs might have side effects when written). If the uimm[4:0] field is zero, then these instructions will not write to the CSR, and shall not cause any of the side effects that might otherwise occur on a CSR write. 
| Control and Status Register Operations | +--------+---------------------------+------------------------------------------------------------+------------------+------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+----------------------------------------+ -RVZifencei Instruction Fetch Fence ----------------------------------- - - -+---------+----------+---------------------+------------------+--------------------+------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+------------------------+ -| Name | Format | Pseudocode | Invalid_values | Exception_raised | Description | Op Name | 
-+=========+==========+=====================+==================+====================+==============================================================================================================================================================================================================================================================================================================================================================================================================================+========================+ -| FENCE.I | fence.i | Fence(Store, Fetch) | NONE | NONE | The FENCE.I instruction is used to synchronize the instruction and data streams. RISC-V does not guarantee that stores to instruction memory will be made visible to instruction fetches on the same RISC-V hart until a FENCE.I instruction is executed. A FENCE.I instruction only ensures that a subsequent instruction fetch on a RISC-V hart will see any previous data stores already visible to the same RISC-V hart. | Fetch Fence Operations | -+---------+----------+---------------------+------------------+--------------------+------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+------------------------+ - RV32Zcb Code Size Reduction Instructions ---------------------------------------- diff --git a/config/gen_from_riscv_config/cv32a65x/linker/link.ld b/config/gen_from_riscv_config/cv32a65x/linker/link.ld new file mode 100644 index 0000000000..a134ec289a --- /dev/null +++ b/config/gen_from_riscv_config/cv32a65x/linker/link.ld @@ -0,0 +1,82 @@ +/*======================================================================*/ +/* Proxy kernel linker 
script */ +/*======================================================================*/ +/* This is the linker script used when building the proxy kernel. */ + +/*----------------------------------------------------------------------*/ +/* Setup */ +/*----------------------------------------------------------------------*/ + +/* The OUTPUT_ARCH command specifies the machine architecture where the + argument is one of the names used in the BFD library. More + specifically one of the entires in bfd/cpu-mips.c */ + +OUTPUT_ARCH( "riscv" ) +ENTRY(_start) + +/*----------------------------------------------------------------------*/ +/* Sections */ +/*----------------------------------------------------------------------*/ + +SECTIONS +{ + + /* text: test code section */ + . = 0x80000000; + _start_text = .; + .text.init : { *(.text.init) } + + . = ALIGN(0x1000); + .tohost : { *(.tohost) } + + . = ALIGN(0x1000); + .uvmif : { *(.uvmif) } + + . = ALIGN(0x1000); + .text : { *(.text) } + . = ALIGN(0x1000); + .text.startup : { *(.text.startup) } + . = ALIGN(0x1000); + _end_text = .; + . = ALIGN(0x1000); + .rodata : { *(.rodata*)} + . = ALIGN(0x8); + . = ALIGN(0x1000); + .page_table : { *(.page_table) } + .user_stack : { *(.user_stack) } + .kernel_data : { *(.kernel_data) } + .kernel_stack : { *(.kernel_stack) } + + /* data segment */ + .data : { *(.data) } + + .sdata : { + __global_pointer$ = . 
+ 0x800; + *(.srodata.cst16) *(.srodata.cst8) *(.srodata.cst4) *(.srodata.cst2) *(.srodata*) + *(.sdata .sdata.* .gnu.linkonce.s.*) + } + + /* bss segment */ + .sbss : { + *(.sbss .sbss.* .gnu.linkonce.sb.*) + *(.scommon) + } + .bss : { *(.bss) } + + /* thread-local data segment */ + .tdata : + { + _tdata_begin = .; + *(.tdata) + _tdata_end = .; + } + .tbss : + { + *(.tbss) + _tbss_end = .; + } + + /* End of uninitalized data segement */ + _end = .; +} + diff --git a/config/gen_from_riscv_config/cv32a65x/spike/spike.yaml b/config/gen_from_riscv_config/cv32a65x/spike/spike.yaml index c6b7c26ee1..cc07e68fc5 100644 --- a/config/gen_from_riscv_config/cv32a65x/spike/spike.yaml +++ b/config/gen_from_riscv_config/cv32a65x/spike/spike.yaml @@ -1,28 +1,54 @@ spike_param_tree: bootrom: true - bootrom_base: 0x10000 - bootrom_size: 0x1000 + bootrom_base: 65536 + bootrom_size: 4096 dram: true - dram_base: 0x80000000 - dram_size: 0x40000000 + dram_base: 2147483648 + dram_size: 1073741824 generic_core_config: false max_steps: 200000 max_steps_enabled: false - isa: rv32imc_zba_zbb_zbs_zbc_zicsr_zifencei - priv: MSU + isa: rv32imczicsr_zcb_zba_zbb_zbc_zbs + priv: M core_configs: - - isa: rv32imc_zba_zbb_zbs_zbc_zicsr_zifencei - marchid: 0x3 - misa_we: false - misa_we_enable: true - misaligned: false - mmu_mode: sv39 - mvendorid: 0x00000602 - pmpaddr0: 0x0 - pmpcfg0: 0x0 - pmpregions: 0x0 - priv: MSU + - + isa: rv32imczicsr_zcb_zba_zbb_zbc_zbs + extensions: cv32a60x,cvxif + boot_addr: 2147483648 + marchid_override_mask: 0xFFFFFFFF + marchid_override_value: 0x3 + misa_write_mask: 0x0 + pmp_granularity: 8 + pmpaddr0: 0 + pmpcfg0: 0 + pmpregions_max: 64 + pmpregions_writable: 8 + priv: M status_fs_field_we: false status_fs_field_we_enable: false status_vs_field_we: false status_vs_field_we_enable: false + mstatus_write_mask: 136 + mstatus_override_mask: 6144 + mie_write_mask: 0x00000880 + mie_override_mask: 0xfffff77f + mie_override_value: 0x00000000 + mip_write_mask: 0x00000000 + 
mip_override_mask: 0xfffff77f + mip_override_value: 0x00000000 + mtval_write_mask: 0 + tinfo_accessible: 0 + mscontext_accessible: 0 + mcontext_accessible: 0 + tdata1_accessible: 0 + tdata2_accessible: 0 + tdata3_accessible: 0 + tselect_accessible: 0 + mhartid: 0 + mvendorid_override_mask : 0xFFFFFFFF + mvendorid_override_value: 1538 + csr_counters_injection: true + interrupts_injection: true + unified_traps: true + mcycleh_implemented: false + mhpmevent31_implemented: false diff --git a/verif/sim/link.ld b/config/gen_from_riscv_config/linker/link.ld similarity index 97% rename from verif/sim/link.ld rename to config/gen_from_riscv_config/linker/link.ld index d064469c6b..1fe4a63875 100644 --- a/verif/sim/link.ld +++ b/config/gen_from_riscv_config/linker/link.ld @@ -24,6 +24,8 @@ SECTIONS . = ALIGN(0x1000); .tohost : { *(.tohost) } . = ALIGN(0x1000); + .uvmif : { *(.uvmif) } + . = ALIGN(0x1000); .text : { *(.text) } . = ALIGN(0x1000); .text.startup : { *(.text.startup) } diff --git a/config/gen_from_riscv_config/requirements.txt b/config/gen_from_riscv_config/requirements.txt index f97f9c9699..50404b7d41 100644 --- a/config/gen_from_riscv_config/requirements.txt +++ b/config/gen_from_riscv_config/requirements.txt @@ -3,4 +3,5 @@ pyyaml mdutils restructuredtext-lint rstcloth -regex \ No newline at end of file +regex +Mako \ No newline at end of file diff --git a/config/gen_from_riscv_config/scripts/libs/csr_factorizer.py b/config/gen_from_riscv_config/scripts/libs/csr_factorizer.py index c4a2f079eb..e291f6303e 100644 --- a/config/gen_from_riscv_config/scripts/libs/csr_factorizer.py +++ b/config/gen_from_riscv_config/scripts/libs/csr_factorizer.py @@ -24,6 +24,7 @@ def address_to_key(address): def factorizer(yaml_data): privname = None + legalname= None fieldname = [] regname = [] regdescr = [] @@ -37,9 +38,9 @@ def factorizer(yaml_data): suffix_address = [] suffix_number = [] key_to_remove = [] - for key, value in yaml_data["hart0"].items(): + for key, value in 
yaml_data.items(): if isinstance(value, dict): - regelement = yaml_data["hart0"].get(key, {}) + regelement = yaml_data.get(key, {}) if regelement.get("address", None): regaddress = hex(regelement.get("address", None)) else: @@ -49,11 +50,16 @@ def factorizer(yaml_data): else: desc = "" if regelement.get("rv32", "")["accessible"]: + fields = regelement.get("rv32", "").get("fields", []) + if not fields : + legal = regelement.get("rv32", "").get("type", None).keys() if regelement.get("rv32", "").get("type", None) is not None else None + else : + legal = [regelement.get("rv32", "").get(item, {}).get("type").keys()for item in fields if not isinstance(item, list) and regelement.get("rv32", "").get(item, {}).get("type") is not None] pattern = r"(\D+)(\d+)(.*)" match = re.search(pattern, key) if match: key_to_remove.append(key) - if privname and match.group(1) == privname.group(1): + if privname and match.group(1) == privname.group(1) and legalname == legal: if len(match.group(3)) > 0: suffix_name.append(match.group(0)) field_suffix.append(match.group(1)) @@ -75,7 +81,7 @@ def factorizer(yaml_data): start_address = hex(int(regadress[0], 16)) desc = str(regdescr[0]) desc = re.sub(str(regname[0]), fieldname[0], desc) - modified_data = yaml_data["hart0"][regname[0]].copy() + modified_data = yaml_data[regname[0]].copy() modified_data["address"] = ( f"{str(start_address)}-{str(regadress[-1])}" ) @@ -98,13 +104,14 @@ def factorizer(yaml_data): suffix_address = sorted(suffix_address, key=address_to_key) desc = str(suffix_descr[0]) desc = re.sub(str(suffix_name[0]), field_suffix[0], desc) - modified_data = yaml_data["hart0"][suffix_name[0]].copy() + modified_data = yaml_data[suffix_name[0]].copy() modified_data["address"] = ( f"{str(suffix_address[0])}-{str(suffix_address[-1])}" ) new_regname.append( f"{field_suffix[0]}[{suffix_number[0]}-{suffix_number[-1]}]h" ) + print(new_regname) data.append(modified_data) suffix_name = [] field_suffix = [] @@ -112,12 +119,13 @@ def 
factorizer(yaml_data): suffix_number = [match.group(2)] suffix_address = [] privname = match + legalname = legal if regname: start_address = hex(int(regadress[0], 16)) end_address = str(regadress[-1]) desc = str(regdescr[0]) desc = re.sub(str(regname[0]), fieldname[0], desc) - modified_data = yaml_data["hart0"][regname[0]].copy() + modified_data = yaml_data[regname[0]].copy() modified_data["description"] = desc modified_data["address"] = f"{str(start_address)}-{str(end_address)}" new_regname.append(f"{fieldname[0]}[{reg_number[0]}-{reg_number[-1]}]") @@ -129,7 +137,7 @@ def factorizer(yaml_data): if suffix_name: desc = str(suffix_descr[0]) desc = re.sub(str(suffix_name[0]), field_suffix[0], desc) - modified_data = yaml_data["hart0"][suffix_name[0]].copy() + modified_data = yaml_data[suffix_name[0]].copy() modified_data["description"] = desc modified_data["address"] = ( f"{str(hex(int(suffix_address[0],16)))}-{str(suffix_address[-1])}" @@ -143,7 +151,7 @@ def factorizer(yaml_data): regdescr = [] regadress = [] for index, reg in enumerate(new_regname): - yaml_data["hart0"][reg] = data[index] + yaml_data[reg] = data[index] for key in key_to_remove: - del yaml_data["hart0"][key] - return yaml_data["hart0"] + del yaml_data[key] + return yaml_data diff --git a/config/gen_from_riscv_config/scripts/libs/csr_updater.py b/config/gen_from_riscv_config/scripts/libs/csr_updater.py index 5a0b212996..2a5077d577 100644 --- a/config/gen_from_riscv_config/scripts/libs/csr_updater.py +++ b/config/gen_from_riscv_config/scripts/libs/csr_updater.py @@ -5,46 +5,47 @@ def csr_recursive_update(original_dict, csr_update): """ Gets the data of the RISC-V Config Yaml file and - updates the value of sub key in the RISC-V Config Yaml file - (ex: reset-val, shadow_type) - :param original_dict: parsed data of the RISC-V Config Yaml file - :param csr_update: parsed data of the CSR updater + update the value of sub key in RISC-V Config Yaml file + (ex: reset-val , address) + :param original_dict : 
parsed data of RISC-V Config Yaml file + csr_update : parsed data of CSR updater :return: data of RISC-V Config Yaml file updated """ for key, value in csr_update.items(): if key in original_dict: if isinstance(value, dict) and isinstance(original_dict[key], dict): - # If both are dicts, recurse - if key == "type": - # Replace the entire type dictionary + if key == "rv32": original_dict[key] = value else: csr_recursive_update(original_dict[key], value) else: - # Replace the original value with the update value original_dict[key] = value -def csr_formatter(srcfile, customfile, modifile): +def csr_formatter(srcfile, customfile, debugfile, modifile): # Read original dictionary from YAML source file with open(srcfile, "r", encoding="utf-8") as file: original_dict = yaml.safe_load(file) with open(customfile, "r", encoding="utf-8") as file: custom_dict = yaml.safe_load(file) - - isa_data = original_dict.copy() - isa_data['hart0'].update(custom_dict["hart0"]) - updated_values = {} + debug_dict = {} + riscv_config_data = original_dict.copy() + if debugfile is not None: + with open(debugfile, "r", encoding="utf-8") as file: + debug_dict = yaml.safe_load(file) + if debug_dict["hart0"]["debug_mode"]: + riscv_config_data["hart0"].update(debug_dict["hart0"]) + riscv_config_data["hart0"].update(custom_dict["hart0"]) + update_dict = {} if modifile is not None: with open(modifile, "r", encoding="utf-8") as file: - updated_values = yaml.safe_load(file) - + update_dict = yaml.safe_load(file) + print(riscv_config_data["hart0"]) # Update original_dict with values from updated_values recursively - csr_recursive_update(isa_data["hart0"], updated_values) - + csr_recursive_update(riscv_config_data["hart0"], update_dict) # Identify and remove keys within the range specified for each register keys_to_remove = [] - for key, value in updated_values.items(): + for key, value in update_dict.items(): if "range" in value: range_value = value["range"] pattern = rf"{key}(\d+)" @@ -55,7 +56,7 @@ def 
csr_formatter(srcfile, customfile, modifile): if index >= range_value: keys_to_remove.append(k) # Remove excluded keys based on the condition - exclude_data = updated_values.get("exclude") + exclude_data = update_dict.get("exclude") if exclude_data: exclude_key = exclude_data.get("key") sub_key = exclude_data.get("sub_key") @@ -75,12 +76,12 @@ def remove_keys_recursive(dictionary): for k in keys_to_remove: dictionary.pop(k) - remove_keys_recursive(isa_data["hart0"]) - remove_keys_recursive(isa_data["hart0"]) + remove_keys_recursive(riscv_config_data["hart0"]) + remove_keys_recursive(riscv_config_data["hart0"]) # Remove keys from original_dict for k in keys_to_remove: - isa_data["hart0"].pop(k, None) + riscv_config_data["hart0"].pop(k, None) # Remove keys from original_dict for k in keys_to_remove: - isa_data.pop(k, None) - return isa_data + riscv_config_data.pop(k, None) + return riscv_config_data["hart0"] diff --git a/config/gen_from_riscv_config/scripts/libs/spike_updater.py b/config/gen_from_riscv_config/scripts/libs/spike_updater.py new file mode 100644 index 0000000000..79a48b7e7a --- /dev/null +++ b/config/gen_from_riscv_config/scripts/libs/spike_updater.py @@ -0,0 +1,74 @@ +# Copyright 2024 Thales DIS France SAS +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+# +# Original Author: Oukalrazqou Abdessamii + + +""" Module is used to update Spike based on yaml file called spike updater """ +import re +import yaml +from yaml import BaseLoader + + +def spike_recursive_update(original_dict, spike_update): + """ + Gets the data of the RISC-V Config Yaml file and + update the value of sub key in RISC-V Config Yaml file + (ex: priv , pmpaddr) + :param original_dict : parsed data of Spike Yaml file + spike_update : parsed data of Spike updaters + :return: data of Spike Yaml file updated + """ + for key, value in spike_update.items(): + if key in original_dict: + if isinstance(value, dict) and isinstance(original_dict[key], dict): + if key == "cores": + original_dict[key] = value + else: + csr_recursive_update(original_dict[key], value) + else: + original_dict[key] = value + + +def is_hex_string(s): + return bool(re.match(r"'?(0x[0-9a-fA-F]+)'?", s)) + + +def custom_convert(data): + if isinstance(data, dict): + return {k: custom_convert(v) for k, v in data.items()} + elif isinstance(data, list): + return [custom_convert(item) for item in data] + elif isinstance(data, str): + if data.lower() == "true": + return True + elif data.lower() == "false": + return False + elif data.isdigit(): + return int(data) + elif is_hex_string(data.strip()): + return int(data, 16) + return data + + +def spike_formatter(original_dict, modifile): + # Read original dictionary from YAML Source file + updated_values = {} + if modifile is not None: + with open(modifile, "r", encoding="utf-8") as file: + updated_values = yaml.load(file, Loader=BaseLoader) + # Update original_dict with values from updated_values recursively + spike_recursive_update(original_dict["spike_param_tree"], updated_values) + original_dict = custom_convert(original_dict) + return original_dict diff --git a/config/gen_from_riscv_config/scripts/libs/utils.py b/config/gen_from_riscv_config/scripts/libs/utils.py index 8348cd2777..796a6057ce 100644 --- 
a/config/gen_from_riscv_config/scripts/libs/utils.py +++ b/config/gen_from_riscv_config/scripts/libs/utils.py @@ -14,28 +14,31 @@ # # Original Author: Oukalrazqou Abdessamii -""" Module is used to gather all utils and function to generate the csr and isa documents""" +"""Module is used to gather all utils and function to generate the csr and isa documents""" import io import os import re import yaml +from yaml import BaseLoader import rstcloth +import json +from mako.template import Template import libs.isa_updater import libs.csr_updater +import libs.spike_updater import libs.csr_factorizer from rstcloth import RstCloth from mdutils.mdutils import MdUtils from libs.isa_updater import isa_filter from libs.csr_updater import csr_formatter +from libs.spike_updater import spike_formatter from libs.csr_factorizer import factorizer -pattern_warl = ( - r"\b(?:warl|wlrl|ro_constant|ro_variable|rw|ro)\b" # pattern to detect warl in field -) +pattern_warl = r"\b(?:warl|wlrl|ro_constant|ro_variable|rw|ro)\b" # pattern to detect warl in field pattern_legal_dict = r"\[(0x[0-9A-Fa-f]+)(.*?(0x[0-9A-Fa-f]+))?\]" # pattern to detect if warl field is dict pattern_legal_list = r"\[(0x[0-9A-Fa-f]+)(.*?(0x[0-9A-Fa-f]+))?\]" # pattern to detect if warl field is a list -Factorizer_pattern = r".*(\d).*" # pattern to detect factorized fields +Factorizer_pattern = r"\d+" # pattern to detect factorized fields class DocumentClass: @@ -131,16 +134,32 @@ class Render: """Collection of general rendering methods which can be overridden if needed for a specific output format.""" + @staticmethod + def is_decimal(value): + """return a bool checking if value is decimal""" + try: + int( + value + ) # Alternatively, use float(value) if you want to check for floating point numbers + return True + except ValueError: + return False + @staticmethod def range(start, end): """Return a string representing the range START..END, inclusive. 
START and END are strings representing numerical values.""" + if Render.is_decimal(start): + start = hex(int(start)) + if Render.is_decimal(end): + end = hex(int(end)) return f"{start} - {end}" @staticmethod def value_set(values): """Return a string representing the set of values in VALUES. VALUES is a list of strings.""" + # values = [hex(int(value, 16)) if '0x' in value and '-' not in value else value for value in values] return ", ".join(values) @staticmethod @@ -162,6 +181,72 @@ def fieldtype(typ): return upcased +class CoreConfig: + def __init__( + self, + isa, + marchid, + misa_we, + misa_we_enable, + misaligned, + mmu_mode, + mvendorid, + pmpaddr0, + pmpcfg0, + pmpregions, + priv, + status_fs_field_we, + status_fs_field_we_enable, + status_vs_field_we, + status_vs_field_we_enable, + ): + self.isa = isa + self.marchid = marchid + self.misa_we = misa_we + self.misa_we_enable = misa_we_enable + self.misaligned = misaligned + self.mmu_mode = mmu_mode + self.mvendorid = mvendorid + self.pmpaddr0 = pmpaddr0 + self.pmpcfg0 = pmpcfg0 + self.pmpregions = pmpregions + self.priv = priv + self.status_fs_field_we = status_fs_field_we + self.status_fs_field_we_enable = status_fs_field_we_enable + self.status_vs_field_we = status_vs_field_we + self.status_vs_field_we_enable = status_vs_field_we_enable + + +class Spike: + def __init__( + self, + bootrom, + bootrom_base, + bootrom_size, + dram, + dram_base, + dram_size, + generic_core_config, + max_steps, + max_steps_enabled, + isa, + priv, + core_configs, + ): + self.bootrom = bootrom + self.bootrom_base = bootrom_base + self.bootrom_size = bootrom_size + self.dram = dram + self.dram_base = dram_base + self.dram_size = dram_size + self.generic_core_config = generic_core_config + self.max_steps = max_steps + self.max_steps_enabled = max_steps_enabled + self.isa = isa + self.priv = priv + self.core_configs = core_configs + + # --------------------------------------------------------------# class ISAdocumentClass: """ISA 
document class""" @@ -175,7 +260,7 @@ def addInstructionMapBlock(self, InstructionMap): class InstructionMapClass: - """ISA instruction map class""" + """ISA instruction map c.2n lass""" def __init__(self, name): self.name = name @@ -369,11 +454,7 @@ def returnAsString(self): field.name.upper(), field.fieldreset, field.fieldaccess, - ( - Render.bitmask(field.andMask, field.orMask) - if field.andMask and field.orMask - else field.bitlegal - ), + field.bitlegal, ] _line.append(field.fieldDesc) reg_table.append(_line) @@ -454,6 +535,193 @@ def returnAsString(self): r.table(header=_headers, data=reg_table) return r.data +class AdocAddressBlock(AddressBlockClass): + """Generates an AsciiDoc file from a IP-XACT register description""" + + def __init__(self, name): + super().__init__("csr") + self.name = name + self.registerList = [] + self.suffix = ".adoc" + + def get_access_privilege(self, reg): + """Registers with address bits [11:10] == 2'b11 are Read-Only + as per privileged ISA spec.""" + # Handle register address ranges separated by dashes. 
+ if (int(reg.address.split("-")[0], 0) & 0xC00) == 0xC00: + return "RO" + else: + return "RW" + + def generate_label(self, name): + return "_" + name.replace('[','').replace(']','').upper() + + def returnAsString(self): + registerlist = sorted(self.registerList, key=lambda reg: reg.address) + r = "" + regNameList = [reg.name.upper() for reg in registerlist] + regAddressList = [reg.address for reg in registerlist] + regPrivModeList = [reg.access for reg in registerlist] + regPrivAccessList = [self.get_access_privilege(reg) for reg in registerlist] + regDescrList = [reg.desc for reg in registerlist] + regRV32List = [reg.RV32 for reg in registerlist] + regRV64List = [reg.RV64 for reg in registerlist] + + r += "////\n" + r += " Copyright (c) 2024 OpenHW Group\n" + r += " Copyright (c) 2024 Thales\n" + r += " SPDX-License-Identifier: Apache-2.0 WITH SHL-2.1\n" + r += " Author: Abdessamii Oukalrazqou\n" + r += "////\n\n" + + r += "=== %s\n\n"%self.name + r += "==== Conventions\n\n" + + r += "In the subsequent sections, register fields are labeled with one of the following abbreviations:\n\n" + + r += "* WPRI (Writes Preserve Values, Reads Ignore Values): read/write field reserved\n" + r += "for future use. 
For forward compatibility, implementations that do not\n" + r += "furnish these fields must make them read-only zero.\n" + + r += "* WLRL (Write/Read Only Legal Values): read/write CSR field that specifies\n" + r += "behavior for only a subset of possible bit encodings, with other bit encodings\n" + r += "reserved.\n" + + r += "* WARL (Write Any Values, Reads Legal Values): read/write CSR fields which are\n" + r += "only defined for a subset of bit encodings, but allow any value to be written\n" + r += "while guaranteeing to return a legal value whenever read.\n" + + r += "* ROCST (Read-Only Constant): A special case of WARL field which admits only one\n" + r += "legal value, and therefore, behaves as a constant field that silently ignores\n" + r += "writes.\n" + + r += "* ROVAR (Read-Only Variable): A special case of WARL field which can take\n" + r += "multiple legal values but cannot be modified by software and depends only on\n" + r += "the architectural state of the hart.\n\n" + + r += "In particular, a register that is not internally divided\n" + r += "into multiple fields can be considered as containing a single field of XLEN bits.\n" + r += "This allows to clearly represent read-write registers holding a single legal value\n" + r += "(typically zero).\n\n" + + r += "==== Register Summary\n\n" + + r += "|===\n" + r += "|Address | Register Name | Privilege | Description\n\n" + for i, _ in enumerate(regNameList): + if regRV32List[i] | regRV64List[i]: + r += "|" + regAddressList[i] + \ + f"| `<<{self.generate_label(regNameList[i])},{regNameList[i].upper()}>>`" + \ + "|" + regPrivModeList[i] + regPrivAccessList[i] + \ + "|" + str(regDescrList[i]) + "\n" + r += "|===\n\n" + + r += "==== Register Description\n\n" + for reg in registerlist: + if reg.RV32 | reg.RV64: + r += "[[%s]]\n"%self.generate_label(reg.name) + r += "===== %s\n\n"%reg.name.upper() + + r += "Address:: %s\n"%reg.address + if reg.resetValue: + # display the resetvalue in hex notation in the full 
length of the register + r += "Reset Value:: 0x%s\n"%f"{reg.resetValue[2:].zfill(int(reg.size/4))}" + # RO/RW privileges are encoded in register address. + r += "Privilege:: %s\n"%(reg.access + self.get_access_privilege(reg)) + r += "Description:: %s\n\n"%(reg.desc) + + reg_table = [] + for field in reg.field: + if field.bitWidth == 1: # only one bit -> no range needed + bits = f"{field.bitlsb}" + else: + bits = f"[{field.bitmsb}:{field.bitlsb}]" + _line = [ + bits, + field.name.upper(), + field.fieldreset, + field.fieldaccess, + ( + Render.bitmask(field.andMask, field.orMask) + if field.andMask and field.orMask + else field.bitlegal + ), + ] + _line.append(field.fieldDesc) + reg_table.append(_line) + + reg_table = sorted( + reg_table, key=lambda x: int(x[0].strip("[]").split(":")[0]) + ) + # table of the register + r += "|===\n" + r += "| Bits | Field Name | Reset Value | Type | Legal Values | Description\n\n" + for reg in reg_table: + for col in reg: + if col == 'Reserved': + col = "_Reserved_" + r +="| %s "%col.replace('\n','').replace('|', '\|') + r += "\n" + r += "|===\n\n" + + return r + +class InstadocBlock(InstructionBlockClass): + """Generates a ISA AsciiDoc file from RISC-V Config Yaml register description""" + + def __init__(self, name): + super().__init__("isa") + self.name = name + self.Instructionlist = [] + self.suffix = ".adoc" + + def returnAsString(self): + r = "" + InstrNameList = [reg.key for reg in self.Instructionlist] + InstrDescrList = [reg.descr for reg in self.Instructionlist] + InstrExtList = [reg.Extension_Name for reg in self.Instructionlist] + + r += "////\n" + r += " Copyright (c) 2024 OpenHW Group\n" + r += " Copyright (c) 2024 Thales\n" + r += " SPDX-License-Identifier: Apache-2.0 WITH SHL-2.1\n" + r += " Author: Abdessamii Oukalrazqou\n" + r += "////\n\n" + + r += "=== %s\n\n"%self.name + r += "==== Instructions\n\n" + + r += "|===\n" + r += "|Subset Name | Name | Description\n\n" + for i, _ in enumerate(InstrNameList): + r += "|%s 
| %s | %s\n"%(str(InstrExtList[i]), + str(InstrNameList[i]), + str(InstrDescrList[i]).replace('\n','')) + r += "|===\n\n" + + for reg in self.Instructionlist: + reg_table = [] + if len(reg.Name) > 0: + r += "==== %s\n\n"%reg.key + r += "|===\n" + r += "| Name | Format | Pseudocode|Invalid_values | Exception_raised | Description| Op Name\n\n" + + for fieldIndex in list(range(len(reg.Name))): + _line = [ + reg.Name[fieldIndex], + reg.Format[fieldIndex], + reg.pseudocode[fieldIndex].replace('|','\|'), + reg.invalid_values[fieldIndex], + reg.exception_raised[fieldIndex], + reg.Description[fieldIndex], + ] + _line.append(reg.OperationName[fieldIndex]) + + for col in _line: + r +="| %s "%col.replace('\n','') + r += "\n" + r += "|===\n\n" + return r class InstmdBlock(InstructionBlockClass): """Generates an ISA Markdown file from a RISC Config Yaml register description""" @@ -672,9 +940,10 @@ def returnMdRegDesc(self, name, address, resetValue, desc, access): class CsrParser: """parse CSR RISC-V config yaml file""" - def __init__(self, srcFile,customFile, target, modiFile=None): + def __init__(self, srcFile, customFile, debugfile, target, modiFile=None): self.srcFile = srcFile self.customFile = customFile + self.debugfile = debugfile self.modiFile = modiFile self.target = target @@ -744,9 +1013,11 @@ def returnRegister( if matches: expr_type = str(matches.group(2)) if expr_type == "bitmask": - # legal_value is left at default, cf. Render.bitmask(). 
andMask = str(matches.group(4)) orMask = str(matches.group(5)) + legal_value = Render.bitmask( + andMask, orMask + ) elif expr_type == "in": if matches.group(3).find(",") >= 0: # list ==> set of values @@ -772,16 +1043,30 @@ def returnRegister( legal_value = matches.group(3) bitlegal = legal_value elif isinstance(legal_2, list): - pattern = r"\s*((?:0x)?[0-9A-Fa-f]+)\s*(.)\s*((?:0x)?[0-9A-Fa-f]+)\s*" - matches = re.search(pattern, legal_2[0]) - if matches: - legal_value = ( - Render.range( - matches.group(1), matches.group(3) - ) - if matches.group(2) == ":" - else Render.value_set(legal_2[0].split(",")) - ) + pattern = r"\s*((?:0x)?[0-9A-Fa-f]+)\s*(:|,?)\s*((?:0x)?[0-9A-Fa-f]+)?" + for value in legal_2: + value_list = value.split(",") + processed_values = [] + for val in value_list: + matches = re.search(pattern, val) + if matches: + first_value = matches.group(1) + separator = matches.group(2) + second_value = ( + matches.group(3) + if matches.group(3) + else first_value + ) + if separator == ":": + processed_value = Render.range( + first_value, second_value + ) + else: + processed_value = hex( + int(first_value) + ) + processed_values.append(processed_value) + legal_value = Render.value_set(processed_values) bitlegal = legal_value else: legal_value = hex(legal_2) @@ -791,9 +1076,13 @@ def returnRegister( if match: match_field = re.search(Factorizer_pattern, str(item)) if match_field: + if match_field.group(0) not in {"0", "1", "2", "3"}: + field_number = int(match_field.group(0)) - 8 + else: + field_number = match_field.group(0) fieldName = re.sub( - match_field.group(1), - f"[i*4 + {match_field.group(1)}]", + Factorizer_pattern, + f"[i*4 +{field_number}]", item, ) else: @@ -819,7 +1108,7 @@ def returnRegister( legal = "" fieldaccess = "WPRI" bitWidth = int(item_[len(item_) - 1]) - int(item_[0]) + 1 - fieldDesc = "*Reserved*" + fieldDesc = "Reserved" bitlegal = legal fieldreset = hex( int(resetValue, 16) >> (bitlsb) & ((1 << ((bitWidth))) - 1) @@ -910,18 
+1199,32 @@ def returnRegister( legal_value = matches.group(3) bitlegal = legal_value elif isinstance(legal_2, list): - pattern = r"\s*((?:0x)?[0-9A-Fa-f]+)\s*(.)\s*((?:0x)?[0-9A-Fa-f]+)\s*" - matches = re.search(pattern, legal_2[0]) - if matches: - legal_value = ( - Render.range(matches.group(1), matches.group(3)) - if matches.group(2) == ":" - else Render.value_set(legal_2[0].split(",")) - ) + pattern = r"\s*((?:0x)?[0-9A-Fa-f]+)\s*(:|,?)\s*((?:0x)?[0-9A-Fa-f]+)?" + for value in legal_2: + value_list = value.split(",") + processed_values = [] + for val in value_list: + matches = re.search(pattern, val) + if matches: + first_value = matches.group(1) + separator = matches.group(2) + second_value = ( + matches.group(3) + if matches.group(3) + else first_value + ) + if separator == ":": + processed_value = Render.range( + first_value, second_value + ) + else: + processed_value = hex(int(first_value)) + processed_values.append(processed_value) + legal_value = Render.value_set(processed_values) bitlegal = legal_value else: - bitmask = 0 - bitlegal = "0x" + hex(legal_2)[2:].zfill(int(size / 4)) + legal_value = hex(legal_2) + bitlegal = legal_value fieldDesc = regDesc fieldreset = "0x" + hex(int(resetValue, 16))[2:].zfill(int(size / 4)) if bitlsb is None: @@ -950,9 +1253,14 @@ def returnRegister( def returnDocument(self): with open(self.srcFile, "r", encoding="utf-8") as f: data = yaml.safe_load(f) - data = csr_formatter(self.srcFile,self.customFile, self.modiFile) - Registers = factorizer(data) docName = data["hart0"] + size = int( + data["hart0"].get("supported_xlen", "")[0] + ) # depends on architecture + data = csr_formatter( + self.srcFile, self.customFile, self.debugfile, self.modiFile + ) + Registers = factorizer(data) d = DocumentClass(docName) m = MemoryMapClass(docName) a = AddressBlockClass("csr") @@ -966,7 +1274,7 @@ def returnDocument(self): else hex(RegElement.get("address", None)) ) reset = hex(RegElement.get("reset-val", "")) - size = 
int(data["hart0"].get("supported_xlen", "")[0]) + access = RegElement.get("priv_mode", "") if Registers.get(register, {}).get("description", "") is not None: desc = Registers.get(register, {}).get("description", "") @@ -1082,6 +1390,96 @@ def returnRegister(self, key, Extension_Name, Descr, instructions_data): return Inst +class SpikeParser: + """A class to parse data related to Spike.""" + + def __init__(self, srcFile, target): + self.srcFile = srcFile + self.target = target + + def returnDocument(self): + with open(self.srcFile, "r", encoding="utf-8") as f: + data = yaml.safe_load(f) + core_configs = [] + pattern = r"pmpaddr(\d+)" + index = 0 + bitWidth = 32 + isa = "" + for entry in data["hart_ids"]: + M = ( + "M" + if data[f"hart{entry}"] + .get("mstatus", {}) + .get("rv32", "") + .get("accessible", []) + else "" + ) + S = ( + "S" + if data[f"hart{entry}"] + .get("sstatus", {}) + .get("rv32", "") + .get("accessible", []) + else "" + ) + U = ( + "U" + if data[f"hart{entry}"] + .get("ustatus", {}) + .get("rv32", "") + .get("accessible", []) + else "" + ) + for k in data[f"hart{entry}"].keys(): + match = re.search(pattern, str(k)) + if match: + index += int(match.group(1)) + isa = data[f"hart{entry}"]["ISA"].lower() + core_config = CoreConfig( + isa=data[f"hart{entry}"]["ISA"].lower(), + marchid=data[f"hart{entry}"].get("marchid", {}).get("reset-val", ""), + misa_we=False, + misa_we_enable=True, + misaligned=data[f"hart{entry}"].get("hw_data_misaligned_support", ""), + mmu_mode=( + "bare" + if not ( + (int(data[f"hart{entry}"].get("satp", {}).get("reset-val", ""))) + >> 31 + ) + else "sv32" + ), + mvendorid=data[f"hart{entry}"] + .get("mvendorid", {}) + .get("reset-val", ""), + pmpaddr0=data[f"hart{entry}"].get("pmpaddr0", {}).get("reset-val", ""), + pmpcfg0=data[f"hart{entry}"].get("pmpcfg0", {}).get("reset-val", ""), + pmpregions=index, + priv=f"{M}{S}{U}".format(M, S, U), + status_fs_field_we=False, + status_fs_field_we_enable=False, + status_vs_field_we=False, 
+ status_vs_field_we_enable=False, + ) + core_configs.append(core_config) + S = Spike( + bootrom=True, + bootrom_base=0x10000, + bootrom_size=0x1000, + dram=True, + dram_base=0x80000000, + dram_size=0x40000000, + generic_core_config=False, + max_steps=200000, + max_steps_enabled=False, + isa=isa, + priv=f"{M}{S}{U}".format(M, S, U), + core_configs=core_configs, + ) + + return S + + class IsaGenerator: """generate isa folder with isa docs""" @@ -1121,7 +1519,6 @@ def __init__(self, target): def write(self, file_name, string): path = f"./{self.target}/csr/" - print(path) if not os.path.exists(path): os.makedirs(path) _dest = os.path.join(path, file_name) @@ -1141,3 +1538,28 @@ def generateCSR(self, generatorClass, document): s = block.returnAsString() file_name = blockName + block.suffix self.write(file_name, s) + + +class SpikeGenerator: + """Generate spike folder with spike docs""" + + def __init__(self, target, temp, modiFile=None): + self.target = target + self.temp = temp + self.modiFile = modiFile + + def write(self, file_name, string): + path = f"./{self.target}/spike/" + if not os.path.exists(path): + os.makedirs(path) + _dest = os.path.join(path, file_name) + print("writing file " + _dest) + with open(_dest, "w", encoding="utf-8") as f: + yaml.dump(string, f, default_flow_style=False, sort_keys=False) + + def generateSpike(self, document): + template = Template(filename=self.temp) + s = template.render(spike=document) + data = spike_formatter(yaml.load(s, Loader=BaseLoader), self.modiFile) + file_name = "spike.yaml" + self.write(file_name, data) diff --git a/config/gen_from_riscv_config/scripts/riscv_config_gen.py b/config/gen_from_riscv_config/scripts/riscv_config_gen.py index 78cd82263c..ea5a58e555 100644 --- a/config/gen_from_riscv_config/scripts/riscv_config_gen.py +++ b/config/gen_from_riscv_config/scripts/riscv_config_gen.py @@ -13,42 +13,64 @@ # limitations under the License. 
# # Original Author: Oukalrazqou Abdessamii -""" Module is used to factorize multiples registers with the same name to - a specific format of registers """ +"""Module is used to factorize multiples registers with the same name to +a specific format of registers""" import argparse from libs.utils import CsrParser from libs.utils import IsaParser +from libs.utils import SpikeParser from libs.utils import IsaGenerator from libs.utils import CsrGenerator +from libs.utils import SpikeGenerator from libs.utils import RstAddressBlock +from libs.utils import AdocAddressBlock from libs.utils import MdAddressBlock from libs.utils import InstrstBlock +from libs.utils import InstadocBlock from libs.utils import InstmdBlock + if __name__ == "__main__": parser = argparse.ArgumentParser(description="GEN From RISC-V Config") parser.add_argument("-s", "--srcFile", help="isa_gen yaml input file") parser.add_argument("-c", "--customFile", help=" custom_gen yaml input file") - parser.add_argument("-d", "--destDir", help="write generated file to dir") + parser.add_argument("-d", "--debugFile", help=" debug_gen yaml input file") parser.add_argument("-m", "--modif", help="ISA /CSR Formatter if exist") parser.add_argument("-i", "--temp", help="Full ISA /SPIKETemplate") parser.add_argument("-t", "--target", help="Specifiy Config Name") + parser.add_argument("-f", "--format", help="Specifiy format output") args, unknown_args = parser.parse_known_args() + + if args.format in ['rst']: + C_instrBlock = InstrstBlock + C_AddressBlock = RstAddressBlock + elif args.format in ['adoc']: + C_instrBlock = InstadocBlock + C_AddressBlock = AdocAddressBlock + elif args.format in ['md']: + C_instrBlock = InstmdBlock + C_AddressBlock = MdAddressBlock + else: + C_instrBlock = InstrstBlock + C_AddressBlock = RstAddressBlock + if args.temp: if "isa" in args.temp: e = IsaParser(args.srcFile, args.temp, args.target, args.modif) document = e.returnDocument() generator = IsaGenerator(args.target) - 
generator.generateISA(InstrstBlock, document) + generator.generateISA(C_instrBlock, document) elif "spike" in args.temp: e = SpikeParser(args.srcFile, args.target) document = e.returnDocument() spike_generator = SpikeGenerator(args.target, args.temp, args.modif) spike_generator.generateSpike(document) else: - e = CsrParser(args.srcFile, args.customFile, args.target, args.modif) + e = CsrParser( + args.srcFile, args.customFile, args.debugFile, args.target, args.modif + ) document = e.returnDocument() generator = CsrGenerator(args.target) - generator.generateCSR(RstAddressBlock, document) + generator.generateCSR(C_AddressBlock, document) diff --git a/config/gen_from_riscv_config/templates/spike.mako b/config/gen_from_riscv_config/templates/spike.mako new file mode 100644 index 0000000000..e6aadd8cde --- /dev/null +++ b/config/gen_from_riscv_config/templates/spike.mako @@ -0,0 +1,53 @@ +# Copyright 2024 Thales DIS France SAS +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# +# Original Author: Oukalrazqou Abdessamii + +<%! 
+def format_hex(value): + return f"0x{value:X}" + +def format_bool(value): + return "true" if value else "false" +%> +spike_param_tree: + bootrom: ${format_bool(spike.bootrom)} + bootrom_base: ${format_hex(spike.bootrom_base)} + bootrom_size: ${format_hex(spike.bootrom_size)} + dram: ${format_bool(spike.dram)} + dram_base: ${format_hex(spike.dram_base)} + dram_size: ${format_hex(spike.dram_size)} + generic_core_config: ${format_bool(spike.generic_core_config)} + max_steps: ${spike.max_steps} + max_steps_enabled: ${format_bool(spike.max_steps_enabled)} + isa: ${spike.isa} + priv: ${spike.priv} + core_configs: +% for core in spike.core_configs: + - isa: ${core.isa} + marchid: ${format_hex(core.marchid)} + misa_we: ${format_bool(core.misa_we)} + misa_we_enable: ${format_bool(core.misa_we_enable)} + misaligned: ${format_bool(core.misaligned)} + mmu_mode: ${core.mmu_mode} + mvendorid: ${format_hex(core.mvendorid)} + pmpaddr0: ${format_hex(core.pmpaddr0)} + pmpcfg0: ${format_hex(core.pmpcfg0)} + pmpregions: ${format_hex(core.pmpregions)} + priv: ${core.priv} + status_fs_field_we: ${format_bool(core.status_fs_field_we)} + status_fs_field_we_enable: ${format_bool(core.status_fs_field_we_enable)} + status_vs_field_we: ${format_bool(core.status_vs_field_we)} + status_vs_field_we_enable: ${format_bool(core.status_vs_field_we_enable)} +% endfor diff --git a/config/gen_from_riscv_config/updaters/cv32a65x/csr_updater.yaml b/config/gen_from_riscv_config/updaters/cv32a65x/csr_updater.yaml index 129ab53171..7070c4d87e 100644 --- a/config/gen_from_riscv_config/updaters/cv32a65x/csr_updater.yaml +++ b/config/gen_from_riscv_config/updaters/cv32a65x/csr_updater.yaml @@ -3,36 +3,1209 @@ # SPDX-License-Identifier: Apache-2.0 WITH SHL-2.1 # Author: Abdessamii Oukalrazqou -mip: - rv32: - meip: - type: - ro_variable: - - 0x0:0x1 - mtip: - type: - ro_variable: - - 0x0:0x1 -mie: - rv32: - meie: - type: - ro_variable: - - 0x0:0x1 - mtie: - type: - ro_variable: - - 0x0:0x1 -mstatus : - rv32 : - 
mie : - type : +mcause: + rv32: + accessible: true + interrupt: + implemented: true + description: Indicates whether the trap was due to an interrupt. + shadow: + shadow_type: rw + msb: 31 + lsb: 31 + type: wlrl: - 0x0:0x1 - mpie : - type: + exception_code: + implemented: true + description: Encodes the exception code. + shadow: + shadow_type: rw + msb: 30 + lsb: 0 + type: wlrl: - - 0x0:0x1 + - 0:8 , 11 + fields: + - exception_code + - interrupt +mip: + rv32: + accessible: true + usip: + implemented: false + description: User Software Interrupt Pending. + shadow: + shadow_type: rw + msb: 0 + lsb: 0 + ssip: + implemented: false + description: Supervisor Software Interrupt Pending. + shadow: + shadow_type: rw + msb: 1 + lsb: 1 + msip: + implemented: false + description: Machine Software Interrupt Pending. + shadow: + shadow_type: rw + msb: 3 + lsb: 3 + utip: + implemented: false + description: User Timer Interrupt Pending. + shadow: + shadow_type: rw + msb: 4 + lsb: 4 + stip: + implemented: false + description: Supervisor Timer Interrupt Pending. + shadow: + shadow_type: rw + msb: 5 + lsb: 5 + mtip: + implemented: true + description: Machine Timer Interrupt Pending. + shadow: + shadow_type: rw + msb: 7 + lsb: 7 + type: + ro_variable: [0:1] + ueip: + implemented: false + description: User External Interrupt Pending. + shadow: + shadow_type: rw + msb: 8 + lsb: 8 + seip: + implemented: false + description: Supervisor External Interrupt Pending. + shadow: + shadow_type: rw + msb: 9 + lsb: 9 + meip: + implemented: true + description: Machine External Interrupt Pending. + shadow: + shadow_type: rw + msb: 11 + lsb: 11 + type: + ro_variable: [0:1] + fields: + - usip + - ssip + - vssip + - msip + - utip + - stip + - vstip + - mtip + - ueip + - seip + - vseip + - meip + - sgeip + - + - + - 13 + - 31 + vssip: + implemented: false + description: VS-level Software Interrupt Pending. 
+ shadow: + shadow_type: rw + msb: 2 + lsb: 2 + vstip: + implemented: false + description: VS-level Timer Interrupt Pending. + shadow: + shadow_type: rw + msb: 6 + lsb: 6 + vseip: + implemented: false + description: VS-level External Interrupt Pending. + shadow: + shadow_type: rw + msb: 10 + lsb: 10 + sgeip: + implemented: false + description: HS-level External Interrupt Pending. + shadow: + shadow_type: rw + msb: 12 + lsb: 12 +pmpcfg2: + rv32: + accessible: true + pmp8cfg: + implemented: true + type: + ro_constant : 0 + description: pmp configuration bits + shadow: + shadow_type: rw + msb: 7 + lsb: 0 + pmp9cfg: + implemented: true + type: + ro_constant : 0 + description: pmp configuration bits + shadow: + shadow_type: rw + msb: 15 + lsb: 8 + pmp10cfg: + implemented: true + type: + ro_constant : 0 + description: pmp configuration bits + shadow: + shadow_type: rw + msb: 23 + lsb: 16 + pmp11cfg: + implemented: true + type: + ro_constant : 0 + description: pmp configuration bits + shadow: + shadow_type: rw + msb: 31 + lsb: 24 + fields: + - pmp8cfg + - pmp9cfg + - pmp10cfg + - pmp11cfg +pmpcfg3: + rv32: + accessible: true + pmp12cfg: + implemented: true + type: + ro_constant : 0 + description: pmp configuration bits + shadow: + shadow_type: rw + msb: 7 + lsb: 0 + pmp13cfg: + implemented: true + type: + ro_constant : 0 + description: pmp configuration bits + shadow: + shadow_type: rw + msb: 15 + lsb: 8 + pmp14cfg: + implemented: true + type: + ro_constant : 0 + description: pmp configuration bits + shadow: + shadow_type: rw + msb: 23 + lsb: 16 + pmp15cfg: + implemented: true + type: + ro_constant : 0 + description: pmp configuration bits + shadow: + shadow_type: rw + msb: 31 + lsb: 24 + fields: + - pmp12cfg + - pmp13cfg + - pmp14cfg + - pmp15cfg + +pmpcfg5: + rv32: + accessible: true + pmp20cfg: + implemented: true + type: + ro_constant : 0 + description: pmp configuration bits + shadow: + shadow_type: rw + msb: 7 + lsb: 0 + pmp21cfg: + implemented: true + type: + 
ro_constant : 0 + description: pmp configuration bits + shadow: + shadow_type: rw + msb: 15 + lsb: 8 + pmp22cfg: + implemented: true + type: + ro_constant : 0 + description: pmp configuration bits + shadow: + shadow_type: rw + msb: 23 + lsb: 16 + pmp23cfg: + implemented: true + type: + ro_constant : 0 + description: pmp configuration bits + shadow: + shadow_type: rw + msb: 31 + lsb: 24 + fields: + - pmp20cfg + - pmp21cfg + - pmp22cfg + - pmp23cfg + +pmpcfg6: + rv32: + accessible: true + pmp24cfg: + implemented: true + type: + ro_constant : 0 + description: pmp configuration bits + shadow: + shadow_type: rw + msb: 7 + lsb: 0 + pmp25cfg: + implemented: true + type: + ro_constant : 0 + description: pmp configuration bits + shadow: + shadow_type: rw + msb: 15 + lsb: 8 + pmp26cfg: + implemented: true + type: + ro_constant : 0 + description: pmp configuration bits + shadow: + shadow_type: rw + msb: 23 + lsb: 16 + pmp27cfg: + implemented: true + type: + ro_constant : 0 + description: pmp configuration bits + shadow: + shadow_type: rw + msb: 31 + lsb: 24 + fields: + - pmp24cfg + - pmp25cfg + - pmp26cfg + - pmp27cfg + +pmpcfg7: + rv32: + accessible: true + pmp28cfg: + implemented: true + type: + ro_constant : 0 + description: pmp configuration bits + shadow: + shadow_type: rw + msb: 7 + lsb: 0 + pmp29cfg: + implemented: true + type: + ro_constant : 0 + description: pmp configuration bits + shadow: + shadow_type: rw + msb: 15 + lsb: 8 + pmp30cfg: + implemented: true + type: + ro_constant : 0 + description: pmp configuration bits + shadow: + shadow_type: rw + msb: 23 + lsb: 16 + pmp31cfg: + implemented: true + type: + ro_constant : 0 + description: pmp configuration bits + shadow: + shadow_type: rw + msb: 31 + lsb: 24 + fields: + - pmp28cfg + - pmp29cfg + - pmp30cfg + - pmp31cfg +pmpcfg8: + rv32: + accessible: true + pmp32cfg: + implemented: true + type: + ro_constant : 0 + description: pmp configuration bits + shadow: + shadow_type: rw + msb: 7 + lsb: 0 + pmp33cfg: + 
implemented: true + type: + ro_constant : 0 + description: pmp configuration bits + shadow: + shadow_type: rw + msb: 15 + lsb: 8 + pmp34cfg: + implemented: true + type: + ro_constant : 0 + description: pmp configuration bits + shadow: + shadow_type: rw + msb: 23 + lsb: 16 + pmp35cfg: + implemented: true + type: + ro_constant : 0 + description: pmp configuration bits + shadow: + shadow_type: rw + msb: 31 + lsb: 24 + fields: + - pmp32cfg + - pmp33cfg + - pmp34cfg + - pmp35cfg +pmpcfg9: + rv32: + accessible: true + pmp36cfg: + implemented: true + type: + ro_constant : 0 + description: pmp configuration bits + shadow: + shadow_type: rw + msb: 7 + lsb: 0 + pmp37cfg: + implemented: true + type: + ro_constant : 0 + description: pmp configuration bits + shadow: + shadow_type: rw + msb: 15 + lsb: 8 + pmp38cfg: + implemented: true + type: + ro_constant : 0 + description: pmp configuration bits + shadow: + shadow_type: rw + msb: 23 + lsb: 16 + pmp39cfg: + implemented: true + type: + ro_constant : 0 + description: pmp configuration bits + shadow: + shadow_type: rw + msb: 31 + lsb: 24 + fields: + - pmp36cfg + - pmp37cfg + - pmp38cfg + - pmp39cfg +pmpcfg10: + rv32: + accessible: true + pmp40cfg: + implemented: true + type: + ro_constant : 0 + description: pmp configuration bits + shadow: + shadow_type: rw + msb: 7 + lsb: 0 + pmp41cfg: + implemented: true + type: + ro_constant : 0 + description: pmp configuration bits + shadow: + shadow_type: rw + msb: 15 + lsb: 8 + pmp42cfg: + implemented: true + type: + ro_constant : 0 + description: pmp configuration bits + shadow: + shadow_type: rw + msb: 23 + lsb: 16 + pmp43cfg: + implemented: true + type: + ro_constant : 0 + description: pmp configuration bits + shadow: + shadow_type: rw + msb: 31 + lsb: 24 + fields: + - pmp40cfg + - pmp41cfg + - pmp42cfg + - pmp43cfg +pmpcfg11: + rv32: + accessible: true + pmp44cfg: + implemented: true + type: + ro_constant : 0 + description: pmp configuration bits + shadow: + shadow_type: rw + msb: 7 + 
lsb: 0 + pmp45cfg: + implemented: true + type: + ro_constant : 0 + description: pmp configuration bits + shadow: + shadow_type: rw + msb: 15 + lsb: 8 + pmp46cfg: + implemented: true + type: + ro_constant : 0 + description: pmp configuration bits + shadow: + shadow_type: rw + msb: 23 + lsb: 16 + pmp47cfg: + implemented: true + type: + ro_constant : 0 + description: pmp configuration bits + shadow: + shadow_type: rw + msb: 31 + lsb: 24 + fields: + - pmp44cfg + - pmp45cfg + - pmp46cfg + - pmp47cfg +pmpcfg12: + rv32: + accessible: true + pmp48cfg: + implemented: true + type: + ro_constant : 0 + description: pmp configuration bits + shadow: + shadow_type: rw + msb: 7 + lsb: 0 + pmp49cfg: + implemented: true + type: + ro_constant : 0 + description: pmp configuration bits + shadow: + shadow_type: rw + msb: 15 + lsb: 8 + pmp50cfg: + implemented: true + type: + ro_constant : 0 + description: pmp configuration bits + shadow: + shadow_type: rw + msb: 23 + lsb: 16 + pmp51cfg: + implemented: true + type: + ro_constant : 0 + description: pmp configuration bits + shadow: + shadow_type: rw + msb: 31 + lsb: 24 + fields: + - pmp48cfg + - pmp49cfg + - pmp50cfg + - pmp51cfg +pmpcfg13: + rv32: + accessible: true + pmp52cfg: + implemented: true + type: + ro_constant : 0 + description: pmp configuration bits + shadow: + shadow_type: rw + msb: 7 + lsb: 0 + pmp53cfg: + implemented: true + type: + ro_constant : 0 + description: pmp configuration bits + shadow: + shadow_type: rw + msb: 15 + lsb: 8 + pmp54cfg: + implemented: true + type: + ro_constant : 0 + description: pmp configuration bits + shadow: + shadow_type: rw + msb: 23 + lsb: 16 + pmp55cfg: + implemented: true + type: + ro_constant : 0 + description: pmp configuration bits + shadow: + shadow_type: rw + msb: 31 + lsb: 24 + fields: + - pmp52cfg + - pmp53cfg + - pmp54cfg + - pmp55cfg +pmpcfg14: + rv32: + accessible: true + pmp56cfg: + implemented: true + type: + ro_constant : 0 + description: pmp configuration bits + shadow: + 
shadow_type: rw + msb: 7 + lsb: 0 + pmp57cfg: + implemented: true + type: + ro_constant : 0 + description: pmp configuration bits + shadow: + shadow_type: rw + msb: 15 + lsb: 8 + pmp58cfg: + implemented: true + type: + ro_constant : 0 + description: pmp configuration bits + shadow: + shadow_type: rw + msb: 23 + lsb: 16 + pmp59cfg: + implemented: true + type: + ro_constant : 0 + description: pmp configuration bits + shadow: + shadow_type: rw + msb: 31 + lsb: 24 + fields: + - pmp56cfg + - pmp57cfg + - pmp58cfg + - pmp59cfg +pmpcfg15: + rv32: + accessible: true + pmp60cfg: + implemented: true + type: + ro_constant : 0 + description: pmp configuration bits + shadow: + shadow_type: rw + msb: 7 + lsb: 0 + pmp61cfg: + implemented: true + type: + ro_constant : 0 + description: pmp configuration bits + shadow: + shadow_type: rw + msb: 15 + lsb: 8 + pmp62cfg: + implemented: true + type: + ro_constant : 0 + description: pmp configuration bits + shadow: + shadow_type: rw + msb: 23 + lsb: 16 + pmp63cfg: + implemented: true + type: + ro_constant : 0 + description: pmp configuration bits + shadow: + shadow_type: rw + msb: 31 + lsb: 24 + fields: + - pmp60cfg + - pmp61cfg + - pmp62cfg + - pmp63cfg + +#Adjust PMPADDR NUMBER FROM 15 TO 64 +pmpaddr16: + rv32: + accessible: true + type: + ro_constant: 0 + fields: [] + shadow: + shadow_type: rw + msb: 31 + lsb: 0 + +pmpaddr17: + rv32: + accessible: true + type: + ro_constant: 0 +pmpaddr18: + rv32: + accessible: true + type: + ro_constant: 0 + +pmpaddr19: + rv32: + accessible: true + type: + ro_constant: 0 + +pmpaddr20: + rv32: + accessible: true + type: + ro_constant: 0 + +pmpaddr21: + rv32: + accessible: true + type: + + ro_constant: 0 + +pmpaddr22: + rv32: + accessible: true + type: + ro_constant: 0 + fields: [] + shadow: + shadow_type: rw + msb: 31 + lsb: 0 +pmpaddr23: + rv32: + accessible: true + type: + ro_constant: 0 + fields: [] + shadow: + shadow_type: rw + msb: 31 + lsb: 0 + +pmpaddr24: + rv32: + accessible: true + type: + 
ro_constant: 0 + fields: [] + shadow: + shadow_type: rw + msb: 31 + lsb: 0 + +pmpaddr25: + rv32: + accessible: true + type: + ro_constant: 0 + fields: [] + shadow: + shadow_type: rw + msb: 31 + lsb: 0 +pmpaddr26: + rv32: + accessible: true + type: + ro_constant: 0 + fields: [] + shadow: + shadow_type: rw + msb: 31 + lsb: 0 + + +pmpaddr27: + rv32: + accessible: true + type: + ro_constant: 0 + fields: [] + shadow: + shadow_type: rw + msb: 31 + lsb: 0 + +pmpaddr28: + rv32: + accessible: true + type: + ro_constant: 0 + fields: [] + shadow: + shadow_type: rw + msb: 31 + lsb: 0 + +pmpaddr29: + rv32: + accessible: true + type: + ro_constant: 0 + fields: [] + shadow: + shadow_type: rw + msb: 31 + lsb: 0 + +pmpaddr30: + rv32: + accessible: true + type: + ro_constant: 0 + fields: [] + shadow: + shadow_type: rw + msb: 31 + lsb: 0 + + + +pmpaddr31: + rv32: + accessible: true + type: + ro_constant: 0 + fields: [] + shadow: + shadow_type: rw + msb: 31 + lsb: 0 +pmpaddr32: + rv32: + accessible: true + type: + ro_constant: 0 + fields: [] + shadow: + shadow_type: rw + msb: 31 + lsb: 0 + +pmpaddr33: + rv32: + accessible: true + type: + ro_constant: 0 + fields: [] + shadow: + shadow_type: rw + msb: 31 + lsb: 0 +pmpaddr34: + rv32: + accessible: true + type: + ro_constant: 0 + fields: [] + shadow: + shadow_type: rw + msb: 31 + lsb: 0 +pmpaddr35: + rv32: + accessible: true + type: + ro_constant: 0 + fields: [] + shadow: + shadow_type: rw + msb: 31 + lsb: 0 + rv64: + accessible: false + reset-val: 0 + description: Physical memory protection address register + address: 0x3D3 + priv_mode: M +pmpaddr36: + rv32: + accessible: true + type: + ro_constant: 0 + fields: [] + shadow: + shadow_type: rw + msb: 31 + lsb: 0 + +pmpaddr37: + rv32: + accessible: true + type: + ro_constant: 0 + fields: [] + shadow: + shadow_type: rw + msb: 31 + lsb: 0 + +pmpaddr38: + rv32: + accessible: true + type: + ro_constant: 0 + fields: [] + shadow: + shadow_type: rw + msb: 31 + lsb: 0 +pmpaddr39: + rv32: + 
accessible: true + type: + ro_constant: 0 + fields: [] + shadow: + shadow_type: rw + msb: 31 + lsb: 0 +pmpaddr40: + rv32: + accessible: true + type: + ro_constant: 0 + fields: [] + shadow: + shadow_type: rw + msb: 31 + lsb: 0 + +pmpaddr41: + rv32: + accessible: true + type: + ro_constant: 0 + fields: [] + shadow: + shadow_type: rw + msb: 31 + lsb: 0 + +pmpaddr42: + rv32: + accessible: true + type: + ro_constant: 0 + fields: [] + shadow: + shadow_type: rw + msb: 31 + lsb: 0 +pmpaddr43: + rv32: + accessible: true + type: + ro_constant: 0 + fields: [] + shadow: + shadow_type: rw + msb: 31 + lsb: 0 + +pmpaddr44: + rv32: + accessible: true + type: + ro_constant: 0 + fields: [] + shadow: + shadow_type: rw + msb: 31 + lsb: 0 +pmpaddr45: + rv32: + accessible: true + type: + ro_constant: 0 + fields: [] + shadow: + shadow_type: rw + msb: 31 + lsb: 0 +pmpaddr46: + rv32: + accessible: true + type: + ro_constant: 0 + fields: [] + shadow: + shadow_type: rw + msb: 31 + lsb: 0 +pmpaddr47: + rv32: + accessible: true + type: + ro_constant: 0 + fields: [] + shadow: + shadow_type: rw + msb: 31 + lsb: 0 +pmpaddr48: + rv32: + accessible: true + type: + ro_constant: 0 + fields: [] + shadow: + shadow_type: rw + msb: 31 + lsb: 0 +pmpaddr49: + rv32: + accessible: true + type: + ro_constant: 0 + fields: [] + shadow: + shadow_type: rw + msb: 31 + lsb: 0 + +pmpaddr50: + rv32: + accessible: true + type: + ro_constant: 0 + fields: [] + shadow: + shadow_type: rw + msb: 31 + lsb: 0 +pmpaddr51: + rv32: + accessible: true + type: + ro_constant: 0 + fields: [] + shadow: + shadow_type: rw + msb: 31 + lsb: 0 +pmpaddr52: + rv32: + accessible: true + type: + ro_constant: 0 + fields: [] + shadow: + shadow_type: rw + msb: 31 + lsb: 0 +pmpaddr53: + rv32: + accessible: true + type: + ro_constant: 0 + fields: [] + shadow: + shadow_type: rw + msb: 31 + lsb: 0 +pmpaddr54: + rv32: + accessible: true + type: + ro_constant: 0 + fields: [] + shadow: + shadow_type: rw + msb: 31 + lsb: 0 +pmpaddr55: + rv32: + 
accessible: true + type: + ro_constant: 0 + fields: [] + shadow: + shadow_type: rw + msb: 31 + lsb: 0 +pmpaddr56: + rv32: + accessible: true + type: + ro_constant: 0 + fields: [] + shadow: + shadow_type: rw + msb: 31 + lsb: 0 +pmpaddr57: + rv32: + accessible: true + type: + ro_constant: 0 + fields: [] + shadow: + shadow_type: rw + msb: 31 + lsb: 0 +pmpaddr58: + rv32: + accessible: true + type: + ro_constant: 0 + fields: [] + shadow: + shadow_type: rw + msb: 31 + lsb: 0 +pmpaddr59: + rv32: + accessible: true + type: + ro_constant: 0 + fields: [] + shadow: + shadow_type: rw + msb: 31 + lsb: 0 +pmpaddr60: + rv32: + accessible: true + type: + ro_constant: 0 + fields: [] + shadow: + shadow_type: rw + msb: 31 + lsb: 0 +pmpaddr61: + rv32: + accessible: true + type: + ro_constant: 0 + fields: [] + shadow: + shadow_type: rw + msb: 31 + lsb: 0 +pmpaddr62: + rv32: + accessible: true + type: + ro_constant: 0 + fields: [] + shadow: + shadow_type: rw + msb: 31 + lsb: 0 +pmpaddr63: + rv32: + accessible: true + type: + ro_constant: 0 + fields: [] + shadow: + shadow_type: rw + msb: 31 + lsb: 0 # Exclude mode exclude : key : priv_mode @@ -40,3 +1213,5 @@ exclude : exclude : key : priv_mode cond : U + + diff --git a/config/gen_from_riscv_config/updaters/cv32a65x/spike_updater.yaml b/config/gen_from_riscv_config/updaters/cv32a65x/spike_updater.yaml new file mode 100644 index 0000000000..9f81849ccf --- /dev/null +++ b/config/gen_from_riscv_config/updaters/cv32a65x/spike_updater.yaml @@ -0,0 +1,31 @@ +# Copyright (c) 2024 OpenHW Group +# Copyright (c) 2024 Thales +# SPDX-License-Identifier: Apache-2.0 WITH SHL-2.1 +# Author: Abdessamii Oukalrazqou + + + + + +cores: + - isa: rv32imc_zba_zbb_zbs_zbc_zicsr_zifencei + boot_addr: 0x80000000 + marchid: 0x3 + misa_we: false + misa_we_enable: true + pmpaddr0: 0x0 + pmpcfg0: 0x0 + pmpregions: 0x40 + usable_pmpregions : 0x8 + priv: M + status_fs_field_we: false + status_fs_field_we_enable: false + status_vs_field_we: false + 
status_vs_field_we_enable: false + priv: M + misa_we: false + mstatus_write_mask: 0x00000088 + mstatus_override_mask: 0x00001800 + mtval_write_mask: 0x00000000 + unified_traps: true + diff --git a/config/riscv-config/cv32a65x/generated/isa_gen.yaml b/config/riscv-config/cv32a65x/generated/isa_gen.yaml index 0b3ea13925..8aff6698f5 100644 --- a/config/riscv-config/cv32a65x/generated/isa_gen.yaml +++ b/config/riscv-config/cv32a65x/generated/isa_gen.yaml @@ -16,12 +16,12 @@ hart_ids: [0] hart0: - ISA: RV32IMCZicsr_Zicntr_Zifencei_Zcb_Zba_Zbb_Zbc_Zbs + ISA: RV32IMCZicsr_Zcb_Zba_Zbb_Zbc_Zbs User_Spec_Version: '2.3' supported_xlen: - 32 physical_addr_sz: 32 - pmp_granularity: 4 + pmp_granularity: 8 misa: reset-val: 0x40001106 rv32: @@ -2241,7 +2241,7 @@ hart0: warl: dependency_fields: [] legal: - - pmp0cfg[7:0] in [0x00:0xFF] + - pmp0cfg[7:0] bitmask [0x8f, 0x0] wr_illegal: - unchanged description: pmp configuration bits @@ -2255,7 +2255,7 @@ hart0: warl: dependency_fields: [] legal: - - pmp1cfg[7:0] in [0x00:0xFF] + - pmp1cfg[7:0] bitmask [0x8f, 0x0] wr_illegal: - unchanged description: pmp configuration bits @@ -2269,7 +2269,7 @@ hart0: warl: dependency_fields: [] legal: - - pmp2cfg[7:0] in [0x00:0xFF] + - pmp2cfg[7:0] bitmask [0x8f, 0x0] wr_illegal: - unchanged description: pmp configuration bits @@ -2283,7 +2283,7 @@ hart0: warl: dependency_fields: [] legal: - - pmp3cfg[7:0] in [0x00:0xFF] + - pmp3cfg[7:0] bitmask [0x8f, 0x0] wr_illegal: - unchanged description: pmp configuration bits @@ -2311,7 +2311,7 @@ hart0: warl: dependency_fields: [] legal: - - pmp4cfg[7:0] in [0x00:0xFF] + - pmp4cfg[7:0] bitmask [0x8f, 0x0] wr_illegal: - unchanged description: pmp configuration bits @@ -2325,7 +2325,7 @@ hart0: warl: dependency_fields: [] legal: - - pmp5cfg[7:0] in [0x00:0xFF] + - pmp5cfg[7:0] bitmask [0x8f, 0x0] wr_illegal: - unchanged description: pmp configuration bits @@ -2339,7 +2339,7 @@ hart0: warl: dependency_fields: [] legal: - - pmp6cfg[7:0] in [0x00:0xFF] + - 
pmp6cfg[7:0] bitmask [0x8f, 0x0] wr_illegal: - unchanged description: pmp configuration bits @@ -2353,7 +2353,7 @@ hart0: warl: dependency_fields: [] legal: - - pmp7cfg[7:0] in [0x00:0xFF] + - pmp7cfg[7:0] bitmask [0x8f, 0x0] wr_illegal: - unchanged description: pmp configuration bits @@ -2378,12 +2378,7 @@ hart0: pmp8cfg: implemented: true type: - warl: - dependency_fields: [] - legal: - - pmp8cfg[7:0] in [0x00:0xFF] - wr_illegal: - - unchanged + ro_constant: 0x0 description: pmp configuration bits shadow: shadow_type: rw @@ -2392,12 +2387,7 @@ hart0: pmp9cfg: implemented: true type: - warl: - dependency_fields: [] - legal: - - pmp9cfg[7:0] in [0x00:0xFF] - wr_illegal: - - unchanged + ro_constant: 0x0 description: pmp configuration bits shadow: shadow_type: rw @@ -2406,12 +2396,7 @@ hart0: pmp10cfg: implemented: true type: - warl: - dependency_fields: [] - legal: - - pmp10cfg[7:0] in [0x00:0xFF] - wr_illegal: - - unchanged + ro_constant: 0x0 description: pmp configuration bits shadow: shadow_type: rw @@ -2420,12 +2405,7 @@ hart0: pmp11cfg: implemented: true type: - warl: - dependency_fields: [] - legal: - - pmp11cfg[7:0] in [0x00:0xFF] - wr_illegal: - - unchanged + ro_constant: 0x0 description: pmp configuration bits shadow: shadow_type: rw @@ -2448,12 +2428,7 @@ hart0: pmp12cfg: implemented: true type: - warl: - dependency_fields: [] - legal: - - pmp12cfg[7:0] in [0x00:0xFF] - wr_illegal: - - unchanged + ro_constant: 0x0 description: pmp configuration bits shadow: shadow_type: rw @@ -2462,12 +2437,7 @@ hart0: pmp13cfg: implemented: true type: - warl: - dependency_fields: [] - legal: - - pmp13cfg[7:0] in [0x00:0xFF] - wr_illegal: - - unchanged + ro_constant: 0x0 description: pmp configuration bits shadow: shadow_type: rw @@ -2476,12 +2446,7 @@ hart0: pmp14cfg: implemented: true type: - warl: - dependency_fields: [] - legal: - - pmp14cfg[7:0] in [0x00:0xFF] - wr_illegal: - - unchanged + ro_constant: 0x0 description: pmp configuration bits shadow: shadow_type: rw @@ 
-2490,12 +2455,7 @@ hart0: pmp15cfg: implemented: true type: - warl: - dependency_fields: [] - legal: - - pmp15cfg[7:0] in [0x00:0xFF] - wr_illegal: - - unchanged + ro_constant: 0x0 description: pmp configuration bits shadow: shadow_type: rw @@ -2514,7 +2474,48 @@ hart0: priv_mode: M pmpcfg4: rv32: - accessible: false + accessible: true + pmp16cfg: + implemented: true + type: + ro_constant: 0x0 + description: pmp configuration bits + shadow: + shadow_type: rw + msb: 7 + lsb: 0 + pmp17cfg: + implemented: true + type: + ro_constant: 0x0 + description: pmp configuration bits + shadow: + shadow_type: rw + msb: 15 + lsb: 8 + pmp18cfg: + implemented: true + type: + ro_constant: 0x0 + description: pmp configuration bits + shadow: + shadow_type: rw + msb: 23 + lsb: 16 + pmp19cfg: + implemented: true + type: + ro_constant: 0x0 + description: pmp configuration bits + shadow: + shadow_type: rw + msb: 31 + lsb: 24 + fields: + - pmp16cfg + - pmp17cfg + - pmp18cfg + - pmp19cfg rv64: accessible: false reset-val: 0 @@ -2523,7 +2524,48 @@ hart0: priv_mode: M pmpcfg5: rv32: - accessible: false + accessible: true + pmp20cfg: + implemented: true + type: + ro_constant: 0x0 + description: pmp configuration bits + shadow: + shadow_type: rw + msb: 7 + lsb: 0 + pmp21cfg: + implemented: true + type: + ro_constant: 0x0 + description: pmp configuration bits + shadow: + shadow_type: rw + msb: 15 + lsb: 8 + pmp22cfg: + implemented: true + type: + ro_constant: 0x0 + description: pmp configuration bits + shadow: + shadow_type: rw + msb: 23 + lsb: 16 + pmp23cfg: + implemented: true + type: + ro_constant: 0x0 + description: pmp configuration bits + shadow: + shadow_type: rw + msb: 31 + lsb: 24 + fields: + - pmp20cfg + - pmp21cfg + - pmp22cfg + - pmp23cfg rv64: accessible: false reset-val: 0 @@ -2532,7 +2574,48 @@ hart0: priv_mode: M pmpcfg6: rv32: - accessible: false + accessible: true + pmp24cfg: + implemented: true + type: + ro_constant: 0x0 + description: pmp configuration bits + shadow: + 
shadow_type: rw + msb: 7 + lsb: 0 + pmp25cfg: + implemented: true + type: + ro_constant: 0x0 + description: pmp configuration bits + shadow: + shadow_type: rw + msb: 15 + lsb: 8 + pmp26cfg: + implemented: true + type: + ro_constant: 0x0 + description: pmp configuration bits + shadow: + shadow_type: rw + msb: 23 + lsb: 16 + pmp27cfg: + implemented: true + type: + ro_constant: 0x0 + description: pmp configuration bits + shadow: + shadow_type: rw + msb: 31 + lsb: 24 + fields: + - pmp24cfg + - pmp25cfg + - pmp26cfg + - pmp27cfg rv64: accessible: false reset-val: 0 @@ -2541,7 +2624,48 @@ hart0: priv_mode: M pmpcfg7: rv32: - accessible: false + accessible: true + pmp28cfg: + implemented: true + type: + ro_constant: 0x0 + description: pmp configuration bits + shadow: + shadow_type: rw + msb: 7 + lsb: 0 + pmp29cfg: + implemented: true + type: + ro_constant: 0x0 + description: pmp configuration bits + shadow: + shadow_type: rw + msb: 15 + lsb: 8 + pmp30cfg: + implemented: true + type: + ro_constant: 0x0 + description: pmp configuration bits + shadow: + shadow_type: rw + msb: 23 + lsb: 16 + pmp31cfg: + implemented: true + type: + ro_constant: 0x0 + description: pmp configuration bits + shadow: + shadow_type: rw + msb: 31 + lsb: 24 + fields: + - pmp28cfg + - pmp29cfg + - pmp30cfg + - pmp31cfg rv64: accessible: false reset-val: 0 @@ -2550,7 +2674,48 @@ hart0: priv_mode: M pmpcfg8: rv32: - accessible: false + accessible: true + pmp32cfg: + implemented: true + type: + ro_constant: 0x0 + description: pmp configuration bits + shadow: + shadow_type: rw + msb: 7 + lsb: 0 + pmp33cfg: + implemented: true + type: + ro_constant: 0x0 + description: pmp configuration bits + shadow: + shadow_type: rw + msb: 15 + lsb: 8 + pmp34cfg: + implemented: true + type: + ro_constant: 0x0 + description: pmp configuration bits + shadow: + shadow_type: rw + msb: 23 + lsb: 16 + pmp35cfg: + implemented: true + type: + ro_constant: 0x0 + description: pmp configuration bits + shadow: + shadow_type: rw + 
msb: 31 + lsb: 24 + fields: + - pmp32cfg + - pmp33cfg + - pmp34cfg + - pmp35cfg rv64: accessible: false reset-val: 0 @@ -2559,7 +2724,48 @@ hart0: priv_mode: M pmpcfg9: rv32: - accessible: false + accessible: true + pmp36cfg: + implemented: true + type: + ro_constant: 0x0 + description: pmp configuration bits + shadow: + shadow_type: rw + msb: 7 + lsb: 0 + pmp37cfg: + implemented: true + type: + ro_constant: 0x0 + description: pmp configuration bits + shadow: + shadow_type: rw + msb: 15 + lsb: 8 + pmp38cfg: + implemented: true + type: + ro_constant: 0x0 + description: pmp configuration bits + shadow: + shadow_type: rw + msb: 23 + lsb: 16 + pmp39cfg: + implemented: true + type: + ro_constant: 0x0 + description: pmp configuration bits + shadow: + shadow_type: rw + msb: 31 + lsb: 24 + fields: + - pmp36cfg + - pmp37cfg + - pmp38cfg + - pmp39cfg rv64: accessible: false reset-val: 0 @@ -2568,7 +2774,48 @@ hart0: priv_mode: M pmpcfg10: rv32: - accessible: false + accessible: true + pmp40cfg: + implemented: true + type: + ro_constant: 0x0 + description: pmp configuration bits + shadow: + shadow_type: rw + msb: 7 + lsb: 0 + pmp41cfg: + implemented: true + type: + ro_constant: 0x0 + description: pmp configuration bits + shadow: + shadow_type: rw + msb: 15 + lsb: 8 + pmp42cfg: + implemented: true + type: + ro_constant: 0x0 + description: pmp configuration bits + shadow: + shadow_type: rw + msb: 23 + lsb: 16 + pmp43cfg: + implemented: true + type: + ro_constant: 0x0 + description: pmp configuration bits + shadow: + shadow_type: rw + msb: 31 + lsb: 24 + fields: + - pmp40cfg + - pmp41cfg + - pmp42cfg + - pmp43cfg rv64: accessible: false reset-val: 0 @@ -2577,7 +2824,48 @@ hart0: priv_mode: M pmpcfg11: rv32: - accessible: false + accessible: true + pmp44cfg: + implemented: true + type: + ro_constant: 0x0 + description: pmp configuration bits + shadow: + shadow_type: rw + msb: 7 + lsb: 0 + pmp45cfg: + implemented: true + type: + ro_constant: 0x0 + description: pmp configuration 
bits + shadow: + shadow_type: rw + msb: 15 + lsb: 8 + pmp46cfg: + implemented: true + type: + ro_constant: 0x0 + description: pmp configuration bits + shadow: + shadow_type: rw + msb: 23 + lsb: 16 + pmp47cfg: + implemented: true + type: + ro_constant: 0x0 + description: pmp configuration bits + shadow: + shadow_type: rw + msb: 31 + lsb: 24 + fields: + - pmp44cfg + - pmp45cfg + - pmp46cfg + - pmp47cfg rv64: accessible: false reset-val: 0 @@ -2586,7 +2874,48 @@ hart0: priv_mode: M pmpcfg12: rv32: - accessible: false + accessible: true + pmp48cfg: + implemented: true + type: + ro_constant: 0x0 + description: pmp configuration bits + shadow: + shadow_type: rw + msb: 7 + lsb: 0 + pmp49cfg: + implemented: true + type: + ro_constant: 0x0 + description: pmp configuration bits + shadow: + shadow_type: rw + msb: 15 + lsb: 8 + pmp50cfg: + implemented: true + type: + ro_constant: 0x0 + description: pmp configuration bits + shadow: + shadow_type: rw + msb: 23 + lsb: 16 + pmp51cfg: + implemented: true + type: + ro_constant: 0x0 + description: pmp configuration bits + shadow: + shadow_type: rw + msb: 31 + lsb: 24 + fields: + - pmp48cfg + - pmp49cfg + - pmp50cfg + - pmp51cfg rv64: accessible: false reset-val: 0 @@ -2595,7 +2924,48 @@ hart0: priv_mode: M pmpcfg13: rv32: - accessible: false + accessible: true + pmp52cfg: + implemented: true + type: + ro_constant: 0x0 + description: pmp configuration bits + shadow: + shadow_type: rw + msb: 7 + lsb: 0 + pmp53cfg: + implemented: true + type: + ro_constant: 0x0 + description: pmp configuration bits + shadow: + shadow_type: rw + msb: 15 + lsb: 8 + pmp54cfg: + implemented: true + type: + ro_constant: 0x0 + description: pmp configuration bits + shadow: + shadow_type: rw + msb: 23 + lsb: 16 + pmp55cfg: + implemented: true + type: + ro_constant: 0x0 + description: pmp configuration bits + shadow: + shadow_type: rw + msb: 31 + lsb: 24 + fields: + - pmp52cfg + - pmp53cfg + - pmp54cfg + - pmp55cfg rv64: accessible: false reset-val: 0 @@ -2604,7 
+2974,48 @@ hart0: priv_mode: M pmpcfg14: rv32: - accessible: false + accessible: true + pmp56cfg: + implemented: true + type: + ro_constant: 0x0 + description: pmp configuration bits + shadow: + shadow_type: rw + msb: 7 + lsb: 0 + pmp57cfg: + implemented: true + type: + ro_constant: 0x0 + description: pmp configuration bits + shadow: + shadow_type: rw + msb: 15 + lsb: 8 + pmp58cfg: + implemented: true + type: + ro_constant: 0x0 + description: pmp configuration bits + shadow: + shadow_type: rw + msb: 23 + lsb: 16 + pmp59cfg: + implemented: true + type: + ro_constant: 0x0 + description: pmp configuration bits + shadow: + shadow_type: rw + msb: 31 + lsb: 24 + fields: + - pmp56cfg + - pmp57cfg + - pmp58cfg + - pmp59cfg rv64: accessible: false reset-val: 0 @@ -2613,7 +3024,48 @@ hart0: priv_mode: M pmpcfg15: rv32: - accessible: false + accessible: true + pmp60cfg: + implemented: true + type: + ro_constant: 0x0 + description: pmp configuration bits + shadow: + shadow_type: rw + msb: 7 + lsb: 0 + pmp61cfg: + implemented: true + type: + ro_constant: 0x0 + description: pmp configuration bits + shadow: + shadow_type: rw + msb: 15 + lsb: 8 + pmp62cfg: + implemented: true + type: + ro_constant: 0x0 + description: pmp configuration bits + shadow: + shadow_type: rw + msb: 23 + lsb: 16 + pmp63cfg: + implemented: true + type: + ro_constant: 0x0 + description: pmp configuration bits + shadow: + shadow_type: rw + msb: 31 + lsb: 24 + fields: + - pmp60cfg + - pmp61cfg + - pmp62cfg + - pmp63cfg rv64: accessible: false reset-val: 0 @@ -3004,7 +3456,14 @@ hart0: priv_mode: M pmpaddr16: rv32: - accessible: false + accessible: true + type: + ro_constant: 0x0 + fields: [] + shadow: + shadow_type: rw + msb: 31 + lsb: 0 rv64: accessible: false reset-val: 0 @@ -3013,7 +3472,14 @@ hart0: priv_mode: M pmpaddr17: rv32: - accessible: false + accessible: true + type: + ro_constant: 0x0 + fields: [] + shadow: + shadow_type: rw + msb: 31 + lsb: 0 rv64: accessible: false reset-val: 0 @@ -3022,7 
+3488,14 @@ hart0: priv_mode: M pmpaddr18: rv32: - accessible: false + accessible: true + type: + ro_constant: 0x0 + fields: [] + shadow: + shadow_type: rw + msb: 31 + lsb: 0 rv64: accessible: false reset-val: 0 @@ -3031,7 +3504,14 @@ hart0: priv_mode: M pmpaddr19: rv32: - accessible: false + accessible: true + type: + ro_constant: 0x0 + fields: [] + shadow: + shadow_type: rw + msb: 31 + lsb: 0 rv64: accessible: false reset-val: 0 @@ -3040,7 +3520,14 @@ hart0: priv_mode: M pmpaddr20: rv32: - accessible: false + accessible: true + type: + ro_constant: 0x0 + fields: [] + shadow: + shadow_type: rw + msb: 31 + lsb: 0 rv64: accessible: false reset-val: 0 @@ -3049,7 +3536,14 @@ hart0: priv_mode: M pmpaddr21: rv32: - accessible: false + accessible: true + type: + ro_constant: 0x0 + fields: [] + shadow: + shadow_type: rw + msb: 31 + lsb: 0 rv64: accessible: false reset-val: 0 @@ -3058,7 +3552,14 @@ hart0: priv_mode: M pmpaddr22: rv32: - accessible: false + accessible: true + type: + ro_constant: 0x0 + fields: [] + shadow: + shadow_type: rw + msb: 31 + lsb: 0 rv64: accessible: false reset-val: 0 @@ -3067,7 +3568,14 @@ hart0: priv_mode: M pmpaddr23: rv32: - accessible: false + accessible: true + type: + ro_constant: 0x0 + fields: [] + shadow: + shadow_type: rw + msb: 31 + lsb: 0 rv64: accessible: false reset-val: 0 @@ -3076,7 +3584,14 @@ hart0: priv_mode: M pmpaddr24: rv32: - accessible: false + accessible: true + type: + ro_constant: 0x0 + fields: [] + shadow: + shadow_type: rw + msb: 31 + lsb: 0 rv64: accessible: false reset-val: 0 @@ -3085,7 +3600,14 @@ hart0: priv_mode: M pmpaddr25: rv32: - accessible: false + accessible: true + type: + ro_constant: 0x0 + fields: [] + shadow: + shadow_type: rw + msb: 31 + lsb: 0 rv64: accessible: false reset-val: 0 @@ -3094,7 +3616,14 @@ hart0: priv_mode: M pmpaddr26: rv32: - accessible: false + accessible: true + type: + ro_constant: 0x0 + fields: [] + shadow: + shadow_type: rw + msb: 31 + lsb: 0 rv64: accessible: false reset-val: 0 @@ 
-3103,7 +3632,14 @@ hart0: priv_mode: M pmpaddr27: rv32: - accessible: false + accessible: true + type: + ro_constant: 0x0 + fields: [] + shadow: + shadow_type: rw + msb: 31 + lsb: 0 rv64: accessible: false reset-val: 0 @@ -3112,7 +3648,14 @@ hart0: priv_mode: M pmpaddr28: rv32: - accessible: false + accessible: true + type: + ro_constant: 0x0 + fields: [] + shadow: + shadow_type: rw + msb: 31 + lsb: 0 rv64: accessible: false reset-val: 0 @@ -3121,7 +3664,14 @@ hart0: priv_mode: M pmpaddr29: rv32: - accessible: false + accessible: true + type: + ro_constant: 0x0 + fields: [] + shadow: + shadow_type: rw + msb: 31 + lsb: 0 rv64: accessible: false reset-val: 0 @@ -3130,7 +3680,14 @@ hart0: priv_mode: M pmpaddr30: rv32: - accessible: false + accessible: true + type: + ro_constant: 0x0 + fields: [] + shadow: + shadow_type: rw + msb: 31 + lsb: 0 rv64: accessible: false reset-val: 0 @@ -3139,7 +3696,14 @@ hart0: priv_mode: M pmpaddr31: rv32: - accessible: false + accessible: true + type: + ro_constant: 0x0 + fields: [] + shadow: + shadow_type: rw + msb: 31 + lsb: 0 rv64: accessible: false reset-val: 0 @@ -3148,7 +3712,14 @@ hart0: priv_mode: M pmpaddr32: rv32: - accessible: false + accessible: true + type: + ro_constant: 0x0 + fields: [] + shadow: + shadow_type: rw + msb: 31 + lsb: 0 rv64: accessible: false reset-val: 0 @@ -3157,7 +3728,14 @@ hart0: priv_mode: M pmpaddr33: rv32: - accessible: false + accessible: true + type: + ro_constant: 0x0 + fields: [] + shadow: + shadow_type: rw + msb: 31 + lsb: 0 rv64: accessible: false reset-val: 0 @@ -3166,7 +3744,14 @@ hart0: priv_mode: M pmpaddr34: rv32: - accessible: false + accessible: true + type: + ro_constant: 0x0 + fields: [] + shadow: + shadow_type: rw + msb: 31 + lsb: 0 rv64: accessible: false reset-val: 0 @@ -3175,7 +3760,14 @@ hart0: priv_mode: M pmpaddr35: rv32: - accessible: false + accessible: true + type: + ro_constant: 0x0 + fields: [] + shadow: + shadow_type: rw + msb: 31 + lsb: 0 rv64: accessible: false 
reset-val: 0 @@ -3184,7 +3776,14 @@ hart0: priv_mode: M pmpaddr36: rv32: - accessible: false + accessible: true + type: + ro_constant: 0x0 + fields: [] + shadow: + shadow_type: rw + msb: 31 + lsb: 0 rv64: accessible: false reset-val: 0 @@ -3193,7 +3792,14 @@ hart0: priv_mode: M pmpaddr37: rv32: - accessible: false + accessible: true + type: + ro_constant: 0x0 + fields: [] + shadow: + shadow_type: rw + msb: 31 + lsb: 0 rv64: accessible: false reset-val: 0 @@ -3202,7 +3808,14 @@ hart0: priv_mode: M pmpaddr38: rv32: - accessible: false + accessible: true + type: + ro_constant: 0x0 + fields: [] + shadow: + shadow_type: rw + msb: 31 + lsb: 0 rv64: accessible: false reset-val: 0 @@ -3211,7 +3824,14 @@ hart0: priv_mode: M pmpaddr39: rv32: - accessible: false + accessible: true + type: + ro_constant: 0x0 + fields: [] + shadow: + shadow_type: rw + msb: 31 + lsb: 0 rv64: accessible: false reset-val: 0 @@ -3220,7 +3840,14 @@ hart0: priv_mode: M pmpaddr40: rv32: - accessible: false + accessible: true + type: + ro_constant: 0x0 + fields: [] + shadow: + shadow_type: rw + msb: 31 + lsb: 0 rv64: accessible: false reset-val: 0 @@ -3229,7 +3856,14 @@ hart0: priv_mode: M pmpaddr41: rv32: - accessible: false + accessible: true + type: + ro_constant: 0x0 + fields: [] + shadow: + shadow_type: rw + msb: 31 + lsb: 0 rv64: accessible: false reset-val: 0 @@ -3238,7 +3872,14 @@ hart0: priv_mode: M pmpaddr42: rv32: - accessible: false + accessible: true + type: + ro_constant: 0x0 + fields: [] + shadow: + shadow_type: rw + msb: 31 + lsb: 0 rv64: accessible: false reset-val: 0 @@ -3247,7 +3888,14 @@ hart0: priv_mode: M pmpaddr43: rv32: - accessible: false + accessible: true + type: + ro_constant: 0x0 + fields: [] + shadow: + shadow_type: rw + msb: 31 + lsb: 0 rv64: accessible: false reset-val: 0 @@ -3256,7 +3904,14 @@ hart0: priv_mode: M pmpaddr44: rv32: - accessible: false + accessible: true + type: + ro_constant: 0x0 + fields: [] + shadow: + shadow_type: rw + msb: 31 + lsb: 0 rv64: 
accessible: false reset-val: 0 @@ -3265,7 +3920,14 @@ hart0: priv_mode: M pmpaddr45: rv32: - accessible: false + accessible: true + type: + ro_constant: 0x0 + fields: [] + shadow: + shadow_type: rw + msb: 31 + lsb: 0 rv64: accessible: false reset-val: 0 @@ -3274,7 +3936,14 @@ hart0: priv_mode: M pmpaddr46: rv32: - accessible: false + accessible: true + type: + ro_constant: 0x0 + fields: [] + shadow: + shadow_type: rw + msb: 31 + lsb: 0 rv64: accessible: false reset-val: 0 @@ -3283,7 +3952,14 @@ hart0: priv_mode: M pmpaddr47: rv32: - accessible: false + accessible: true + type: + ro_constant: 0x0 + fields: [] + shadow: + shadow_type: rw + msb: 31 + lsb: 0 rv64: accessible: false reset-val: 0 @@ -3292,7 +3968,14 @@ hart0: priv_mode: M pmpaddr48: rv32: - accessible: false + accessible: true + type: + ro_constant: 0x0 + fields: [] + shadow: + shadow_type: rw + msb: 31 + lsb: 0 rv64: accessible: false reset-val: 0 @@ -3301,7 +3984,14 @@ hart0: priv_mode: M pmpaddr49: rv32: - accessible: false + accessible: true + type: + ro_constant: 0x0 + fields: [] + shadow: + shadow_type: rw + msb: 31 + lsb: 0 rv64: accessible: false reset-val: 0 @@ -3310,7 +4000,14 @@ hart0: priv_mode: M pmpaddr50: rv32: - accessible: false + accessible: true + type: + ro_constant: 0x0 + fields: [] + shadow: + shadow_type: rw + msb: 31 + lsb: 0 rv64: accessible: false reset-val: 0 @@ -3319,7 +4016,14 @@ hart0: priv_mode: M pmpaddr51: rv32: - accessible: false + accessible: true + type: + ro_constant: 0x0 + fields: [] + shadow: + shadow_type: rw + msb: 31 + lsb: 0 rv64: accessible: false reset-val: 0 @@ -3328,7 +4032,14 @@ hart0: priv_mode: M pmpaddr52: rv32: - accessible: false + accessible: true + type: + ro_constant: 0x0 + fields: [] + shadow: + shadow_type: rw + msb: 31 + lsb: 0 rv64: accessible: false reset-val: 0 @@ -3337,7 +4048,14 @@ hart0: priv_mode: M pmpaddr53: rv32: - accessible: false + accessible: true + type: + ro_constant: 0x0 + fields: [] + shadow: + shadow_type: rw + msb: 31 + lsb: 
0 rv64: accessible: false reset-val: 0 @@ -3346,7 +4064,14 @@ hart0: priv_mode: M pmpaddr54: rv32: - accessible: false + accessible: true + type: + ro_constant: 0x0 + fields: [] + shadow: + shadow_type: rw + msb: 31 + lsb: 0 rv64: accessible: false reset-val: 0 @@ -3355,7 +4080,14 @@ hart0: priv_mode: M pmpaddr55: rv32: - accessible: false + accessible: true + type: + ro_constant: 0x0 + fields: [] + shadow: + shadow_type: rw + msb: 31 + lsb: 0 rv64: accessible: false reset-val: 0 @@ -3364,7 +4096,14 @@ hart0: priv_mode: M pmpaddr56: rv32: - accessible: false + accessible: true + type: + ro_constant: 0x0 + fields: [] + shadow: + shadow_type: rw + msb: 31 + lsb: 0 rv64: accessible: false reset-val: 0 @@ -3373,7 +4112,14 @@ hart0: priv_mode: M pmpaddr57: rv32: - accessible: false + accessible: true + type: + ro_constant: 0x0 + fields: [] + shadow: + shadow_type: rw + msb: 31 + lsb: 0 rv64: accessible: false reset-val: 0 @@ -3382,7 +4128,14 @@ hart0: priv_mode: M pmpaddr58: rv32: - accessible: false + accessible: true + type: + ro_constant: 0x0 + fields: [] + shadow: + shadow_type: rw + msb: 31 + lsb: 0 rv64: accessible: false reset-val: 0 @@ -3391,7 +4144,14 @@ hart0: priv_mode: M pmpaddr59: rv32: - accessible: false + accessible: true + type: + ro_constant: 0x0 + fields: [] + shadow: + shadow_type: rw + msb: 31 + lsb: 0 rv64: accessible: false reset-val: 0 @@ -3400,7 +4160,14 @@ hart0: priv_mode: M pmpaddr60: rv32: - accessible: false + accessible: true + type: + ro_constant: 0x0 + fields: [] + shadow: + shadow_type: rw + msb: 31 + lsb: 0 rv64: accessible: false reset-val: 0 @@ -3409,7 +4176,14 @@ hart0: priv_mode: M pmpaddr61: rv32: - accessible: false + accessible: true + type: + ro_constant: 0x0 + fields: [] + shadow: + shadow_type: rw + msb: 31 + lsb: 0 rv64: accessible: false reset-val: 0 @@ -3418,7 +4192,14 @@ hart0: priv_mode: M pmpaddr62: rv32: - accessible: false + accessible: true + type: + ro_constant: 0x0 + fields: [] + shadow: + shadow_type: rw + msb: 31 
+ lsb: 0 rv64: accessible: false reset-val: 0 @@ -3427,7 +4208,14 @@ hart0: priv_mode: M pmpaddr63: rv32: - accessible: false + accessible: true + type: + ro_constant: 0x0 + fields: [] + shadow: + shadow_type: rw + msb: 31 + lsb: 0 rv64: accessible: false reset-val: 0 diff --git a/config/riscv-config/cv32a65x/generated/platform_gen.yaml b/config/riscv-config/cv32a65x/generated/platform_gen.yaml index f6bd4ca837..3bcee8f92e 100644 --- a/config/riscv-config/cv32a65x/generated/platform_gen.yaml +++ b/config/riscv-config/cv32a65x/generated/platform_gen.yaml @@ -21,6 +21,27 @@ reset: mtime: implemented: true address: 0x20000 +memory_map: + - + memory_region: + name: bootrom + base_addr: 0x10000 + size: 0x10000 + description: System boot ROM + attributes: + read_only: true + cached: false + - + memory_region: + name: dram + base_addr: 0x80000000 + size: 0x40000000 + description: System (D)RAM + attributes: + executable: true + cached: true + non_idempotent: false + read_only: false mtimecmp: implemented: false mtval_condition_writes: diff --git a/config/riscv-config/cv32a65x/spec/isa_spec.yaml b/config/riscv-config/cv32a65x/spec/isa_spec.yaml index 9b25133aaa..4b8c766a1c 100644 --- a/config/riscv-config/cv32a65x/spec/isa_spec.yaml +++ b/config/riscv-config/cv32a65x/spec/isa_spec.yaml @@ -16,11 +16,11 @@ hart_ids: [0] hart0: &hart0 - ISA: RV32IMCZicsr_Zicntr_Zifencei_Zcb_Zba_Zbb_Zbc_Zbs + ISA: RV32IMCZicsr_Zcb_Zba_Zbb_Zbc_Zbs User_Spec_Version: '2.3' supported_xlen: [32] physical_addr_sz: 32 - pmp_granularity: 4 + pmp_granularity: 8 misa: reset-val: 0x40001106 # B: bit 1, C: bit 2, I = bit 8, M = bit 12, Z = bit 25 rv32: @@ -998,34 +998,34 @@ hart0: &hart0 warl: dependency_fields: [] legal: - - pmp0cfg[7:0] in [0x00:0xFF] + - pmp0cfg[7:0] bitmask [0x8f, 0x0] wr_illegal: - unchanged pmp1cfg: implemented: true - type: + type: warl: dependency_fields: [] legal: - - pmp1cfg[7:0] in [0x00:0xFF] + - pmp1cfg[7:0] bitmask [0x8f, 0x0] wr_illegal: - unchanged pmp2cfg: implemented: 
true - type: + type: warl: dependency_fields: [] legal: - - pmp2cfg[7:0] in [0x00:0xFF] + - pmp2cfg[7:0] bitmask [0x8f, 0x0] wr_illegal: - unchanged pmp3cfg: implemented: true - type: + type: warl: dependency_fields: [] legal: - - pmp3cfg[7:0] in [0x00:0xFF] + - pmp3cfg[7:0] bitmask [0x8f, 0x0] wr_illegal: - unchanged rv64: @@ -1036,38 +1036,38 @@ hart0: &hart0 accessible: true pmp4cfg: implemented: true - type: + type: warl: dependency_fields: [] legal: - - pmp4cfg[7:0] in [0x00:0xFF] + - pmp4cfg[7:0] bitmask [0x8f, 0x0] wr_illegal: - unchanged pmp5cfg: implemented: true - type: + type: warl: dependency_fields: [] legal: - - pmp5cfg[7:0] in [0x00:0xFF] + - pmp5cfg[7:0] bitmask [0x8f, 0x0] wr_illegal: - unchanged pmp6cfg: implemented: true - type: + type: warl: dependency_fields: [] legal: - - pmp6cfg[7:0] in [0x00:0xFF] + - pmp6cfg[7:0] bitmask [0x8f, 0x0] wr_illegal: - unchanged pmp7cfg: implemented: true - type: + type: warl: dependency_fields: [] legal: - - pmp7cfg[7:0] in [0x00:0xFF] + - pmp7cfg[7:0] bitmask [0x8f, 0x0] wr_illegal: - unchanged rv64: @@ -1078,40 +1078,20 @@ hart0: &hart0 accessible: true pmp8cfg: implemented: true - type: - warl: - dependency_fields: [] - legal: - - pmp8cfg[7:0] in [0x00:0xFF] - wr_illegal: - - unchanged + type: + ro_constant: 0x0 pmp9cfg: implemented: true - type: - warl: - dependency_fields: [] - legal: - - pmp9cfg[7:0] in [0x00:0xFF] - wr_illegal: - - unchanged + type: + ro_constant: 0x0 pmp10cfg: implemented: true - type: - warl: - dependency_fields: [] - legal: - - pmp10cfg[7:0] in [0x00:0xFF] - wr_illegal: - - unchanged + type: + ro_constant: 0x0 pmp11cfg: implemented: true - type: - warl: - dependency_fields: [] - legal: - - pmp11cfg[7:0] in [0x00:0xFF] - wr_illegal: - - unchanged + type: + ro_constant: 0x0 rv64: accessible: false reset-val: 0 @@ -1120,119 +1100,291 @@ hart0: &hart0 accessible: true pmp12cfg: implemented: true - type: - warl: - dependency_fields: [] - legal: - - pmp12cfg[7:0] in [0x00:0xFF] - wr_illegal: 
- - unchanged + type: + ro_constant: 0x0 pmp13cfg: implemented: true - type: - warl: - dependency_fields: [] - legal: - - pmp13cfg[7:0] in [0x00:0xFF] - wr_illegal: - - unchanged + type: + ro_constant: 0x0 pmp14cfg: implemented: true - type: - warl: - dependency_fields: [] - legal: - - pmp14cfg[7:0] in [0x00:0xFF] - wr_illegal: - - unchanged + type: + ro_constant: 0x0 pmp15cfg: implemented: true - type: - warl: - dependency_fields: [] - legal: - - pmp15cfg[7:0] in [0x00:0xFF] - wr_illegal: - - unchanged + type: + ro_constant: 0x0 rv64: accessible: false reset-val: 0 pmpcfg4: rv32: - accessible: false + accessible: true + pmp16cfg: + implemented: true + type: + ro_constant: 0x0 + pmp17cfg: + implemented: true + type: + ro_constant: 0x0 + pmp18cfg: + implemented: true + type: + ro_constant: 0x0 + pmp19cfg: + implemented: true + type: + ro_constant: 0x0 rv64: accessible: false reset-val: 0 pmpcfg5: rv32: - accessible: false + accessible: true + pmp20cfg: + implemented: true + type: + ro_constant: 0x0 + pmp21cfg: + implemented: true + type: + ro_constant: 0x0 + pmp22cfg: + implemented: true + type: + ro_constant: 0x0 + pmp23cfg: + implemented: true + type: + ro_constant: 0x0 rv64: accessible: false reset-val: 0 pmpcfg6: rv32: - accessible: false + accessible: true + pmp24cfg: + implemented: true + type: + ro_constant: 0x0 + pmp25cfg: + implemented: true + type: + ro_constant: 0x0 + pmp26cfg: + implemented: true + type: + ro_constant: 0x0 + pmp27cfg: + implemented: true + type: + ro_constant: 0x0 rv64: accessible: false reset-val: 0 pmpcfg7: rv32: - accessible: false + accessible: true + pmp28cfg: + implemented: true + type: + ro_constant: 0x0 + pmp29cfg: + implemented: true + type: + ro_constant: 0x0 + pmp30cfg: + implemented: true + type: + ro_constant: 0x0 + pmp31cfg: + implemented: true + type: + ro_constant: 0x0 rv64: accessible: false reset-val: 0 pmpcfg8: rv32: - accessible: false + accessible: true + pmp32cfg: + implemented: true + type: + ro_constant: 0x0 + 
pmp33cfg: + implemented: true + type: + ro_constant: 0x0 + pmp34cfg: + implemented: true + type: + ro_constant: 0x0 + pmp35cfg: + implemented: true + type: + ro_constant: 0x0 rv64: accessible: false reset-val: 0 pmpcfg9: rv32: - accessible: false + accessible: true + pmp36cfg: + implemented: true + type: + ro_constant: 0x0 + pmp37cfg: + implemented: true + type: + ro_constant: 0x0 + pmp38cfg: + implemented: true + type: + ro_constant: 0x0 + pmp39cfg: + implemented: true + type: + ro_constant: 0x0 rv64: accessible: false reset-val: 0 pmpcfg10: rv32: - accessible: false + accessible: true + pmp40cfg: + implemented: true + type: + ro_constant: 0x0 + pmp41cfg: + implemented: true + type: + ro_constant: 0x0 + pmp42cfg: + implemented: true + type: + ro_constant: 0x0 + pmp43cfg: + implemented: true + type: + ro_constant: 0x0 rv64: accessible: false reset-val: 0 pmpcfg11: rv32: - accessible: false + accessible: true + pmp44cfg: + implemented: true + type: + ro_constant: 0x0 + pmp45cfg: + implemented: true + type: + ro_constant: 0x0 + pmp46cfg: + implemented: true + type: + ro_constant: 0x0 + pmp47cfg: + implemented: true + type: + ro_constant: 0x0 rv64: accessible: false reset-val: 0 pmpcfg12: rv32: - accessible: false + accessible: true + pmp48cfg: + implemented: true + type: + ro_constant: 0x0 + pmp49cfg: + implemented: true + type: + ro_constant: 0x0 + pmp50cfg: + implemented: true + type: + ro_constant: 0x0 + pmp51cfg: + implemented: true + type: + ro_constant: 0x0 rv64: accessible: false reset-val: 0 pmpcfg13: rv32: - accessible: false + accessible: true + pmp52cfg: + implemented: true + type: + ro_constant: 0x0 + pmp53cfg: + implemented: true + type: + ro_constant: 0x0 + pmp54cfg: + implemented: true + type: + ro_constant: 0x0 + pmp55cfg: + implemented: true + type: + ro_constant: 0x0 rv64: accessible: false reset-val: 0 pmpcfg14: rv32: - accessible: false + accessible: true + pmp56cfg: + implemented: true + type: + ro_constant: 0x0 + pmp57cfg: + implemented: true + 
type: + ro_constant: 0x0 + pmp58cfg: + implemented: true + type: + ro_constant: 0x0 + pmp59cfg: + implemented: true + type: + ro_constant: 0x0 rv64: accessible: false reset-val: 0 pmpcfg15: rv32: - accessible: false + accessible: true + pmp60cfg: + implemented: true + type: + ro_constant: 0x0 + pmp61cfg: + implemented: true + type: + ro_constant: 0x0 + pmp62cfg: + implemented: true + type: + ro_constant: 0x0 + pmp63cfg: + implemented: true + type: + ro_constant: 0x0 rv64: accessible: false reset-val: 0 mcycle: rv32: accessible: true - type: + type: warl: dependency_fields: [] legal: @@ -1245,7 +1397,7 @@ hart0: &hart0 minstret: rv32: accessible: true - type: + type: warl: dependency_fields: [] legal: @@ -1258,7 +1410,7 @@ hart0: &hart0 mcycleh: rv32: accessible: true - type: + type: warl: dependency_fields: [] legal: @@ -1271,7 +1423,7 @@ hart0: &hart0 minstreth: rv32: accessible: true - type: + type: warl: dependency_fields: [] legal: @@ -1284,7 +1436,7 @@ hart0: &hart0 pmpaddr0: rv32: accessible: true - type: + type: warl: dependency_fields: [] legal: @@ -1297,7 +1449,7 @@ hart0: &hart0 pmpaddr1: rv32: accessible: true - type: + type: warl: dependency_fields: [] legal: @@ -1310,7 +1462,7 @@ hart0: &hart0 pmpaddr2: rv32: accessible: true - type: + type: warl: dependency_fields: [] legal: @@ -1323,7 +1475,7 @@ hart0: &hart0 pmpaddr3: rv32: accessible: true - type: + type: warl: dependency_fields: [] legal: @@ -1336,7 +1488,7 @@ hart0: &hart0 pmpaddr4: rv32: accessible: true - type: + type: warl: dependency_fields: [] legal: @@ -1349,7 +1501,7 @@ hart0: &hart0 pmpaddr5: rv32: accessible: true - type: + type: warl: dependency_fields: [] legal: @@ -1362,7 +1514,7 @@ hart0: &hart0 pmpaddr6: rv32: accessible: true - type: + type: warl: dependency_fields: [] legal: @@ -1375,7 +1527,7 @@ hart0: &hart0 pmpaddr7: rv32: accessible: true - type: + type: warl: dependency_fields: [] legal: @@ -1388,7 +1540,7 @@ hart0: &hart0 pmpaddr8: rv32: accessible: true - type: + type: 
ro_constant: 0x0 rv64: accessible: false @@ -1396,7 +1548,7 @@ hart0: &hart0 pmpaddr9: rv32: accessible: true - type: + type: ro_constant: 0x0 rv64: accessible: false @@ -1404,7 +1556,7 @@ hart0: &hart0 pmpaddr10: rv32: accessible: true - type: + type: ro_constant: 0x0 rv64: accessible: false @@ -1412,7 +1564,7 @@ hart0: &hart0 pmpaddr11: rv32: accessible: true - type: + type: ro_constant: 0x0 rv64: accessible: false @@ -1420,7 +1572,7 @@ hart0: &hart0 pmpaddr12: rv32: accessible: true - type: + type: ro_constant: 0x0 rv64: accessible: false @@ -1428,7 +1580,7 @@ hart0: &hart0 pmpaddr13: rv32: accessible: true - type: + type: ro_constant: 0x0 rv64: accessible: false @@ -1436,7 +1588,7 @@ hart0: &hart0 pmpaddr14: rv32: accessible: true - type: + type: ro_constant: 0x0 rv64: accessible: false @@ -1444,296 +1596,392 @@ hart0: &hart0 pmpaddr15: rv32: accessible: true - type: + type: ro_constant: 0x0 rv64: accessible: false reset-val: 0 pmpaddr16: rv32: - accessible: false + accessible: true + type: + ro_constant: 0x0 rv64: accessible: false reset-val: 0 pmpaddr17: rv32: - accessible: false + accessible: true + type: + ro_constant: 0x0 rv64: accessible: false reset-val: 0 pmpaddr18: rv32: - accessible: false + accessible: true + type: + ro_constant: 0x0 rv64: accessible: false reset-val: 0 pmpaddr19: rv32: - accessible: false + accessible: true + type: + ro_constant: 0x0 rv64: accessible: false reset-val: 0 pmpaddr20: rv32: - accessible: false + accessible: true + type: + ro_constant: 0x0 rv64: accessible: false reset-val: 0 pmpaddr21: rv32: - accessible: false + accessible: true + type: + ro_constant: 0x0 rv64: accessible: false reset-val: 0 pmpaddr22: rv32: - accessible: false + accessible: true + type: + ro_constant: 0x0 rv64: accessible: false reset-val: 0 pmpaddr23: rv32: - accessible: false + accessible: true + type: + ro_constant: 0x0 rv64: accessible: false reset-val: 0 pmpaddr24: rv32: - accessible: false + accessible: true + type: + ro_constant: 0x0 rv64: 
accessible: false reset-val: 0 pmpaddr25: rv32: - accessible: false + accessible: true + type: + ro_constant: 0x0 rv64: accessible: false reset-val: 0 pmpaddr26: rv32: - accessible: false + accessible: true + type: + ro_constant: 0x0 rv64: accessible: false reset-val: 0 pmpaddr27: rv32: - accessible: false + accessible: true + type: + ro_constant: 0x0 rv64: accessible: false reset-val: 0 pmpaddr28: rv32: - accessible: false + accessible: true + type: + ro_constant: 0x0 rv64: accessible: false reset-val: 0 pmpaddr29: rv32: - accessible: false + accessible: true + type: + ro_constant: 0x0 rv64: accessible: false reset-val: 0 pmpaddr30: rv32: - accessible: false + accessible: true + type: + ro_constant: 0x0 rv64: accessible: false reset-val: 0 pmpaddr31: rv32: - accessible: false + accessible: true + type: + ro_constant: 0x0 rv64: accessible: false reset-val: 0 pmpaddr32: rv32: - accessible: false + accessible: true + type: + ro_constant: 0x0 rv64: accessible: false reset-val: 0 pmpaddr33: rv32: - accessible: false + accessible: true + type: + ro_constant: 0x0 rv64: accessible: false reset-val: 0 pmpaddr34: rv32: - accessible: false + accessible: true + type: + ro_constant: 0x0 rv64: accessible: false reset-val: 0 pmpaddr35: rv32: - accessible: false + accessible: true + type: + ro_constant: 0x0 rv64: accessible: false reset-val: 0 pmpaddr36: rv32: - accessible: false + accessible: true + type: + ro_constant: 0x0 rv64: accessible: false reset-val: 0 pmpaddr37: rv32: - accessible: false + accessible: true + type: + ro_constant: 0x0 rv64: accessible: false reset-val: 0 pmpaddr38: rv32: - accessible: false + accessible: true + type: + ro_constant: 0x0 rv64: accessible: false reset-val: 0 pmpaddr39: rv32: - accessible: false + accessible: true + type: + ro_constant: 0x0 rv64: accessible: false reset-val: 0 pmpaddr40: rv32: - accessible: false + accessible: true + type: + ro_constant: 0x0 rv64: accessible: false reset-val: 0 pmpaddr41: rv32: - accessible: false + 
accessible: true + type: + ro_constant: 0x0 rv64: accessible: false reset-val: 0 pmpaddr42: rv32: - accessible: false + accessible: true + type: + ro_constant: 0x0 rv64: accessible: false reset-val: 0 pmpaddr43: rv32: - accessible: false + accessible: true + type: + ro_constant: 0x0 rv64: accessible: false reset-val: 0 pmpaddr44: rv32: - accessible: false + accessible: true + type: + ro_constant: 0x0 rv64: accessible: false reset-val: 0 pmpaddr45: rv32: - accessible: false + accessible: true + type: + ro_constant: 0x0 rv64: accessible: false reset-val: 0 pmpaddr46: rv32: - accessible: false + accessible: true + type: + ro_constant: 0x0 rv64: accessible: false reset-val: 0 pmpaddr47: rv32: - accessible: false + accessible: true + type: + ro_constant: 0x0 rv64: accessible: false reset-val: 0 pmpaddr48: rv32: - accessible: false + accessible: true + type: + ro_constant: 0x0 rv64: accessible: false reset-val: 0 pmpaddr49: rv32: - accessible: false + accessible: true + type: + ro_constant: 0x0 rv64: accessible: false reset-val: 0 pmpaddr50: rv32: - accessible: false + accessible: true + type: + ro_constant: 0x0 rv64: accessible: false reset-val: 0 pmpaddr51: rv32: - accessible: false + accessible: true + type: + ro_constant: 0x0 rv64: accessible: false reset-val: 0 pmpaddr52: rv32: - accessible: false + accessible: true + type: + ro_constant: 0x0 rv64: accessible: false reset-val: 0 pmpaddr53: rv32: - accessible: false + accessible: true + type: + ro_constant: 0x0 rv64: accessible: false reset-val: 0 pmpaddr54: rv32: - accessible: false + accessible: true + type: + ro_constant: 0x0 rv64: accessible: false reset-val: 0 pmpaddr55: rv32: - accessible: false + accessible: true + type: + ro_constant: 0x0 rv64: accessible: false reset-val: 0 pmpaddr56: rv32: - accessible: false + accessible: true + type: + ro_constant: 0x0 rv64: accessible: false reset-val: 0 pmpaddr57: rv32: - accessible: false + accessible: true + type: + ro_constant: 0x0 rv64: accessible: false reset-val: 
0 pmpaddr58: rv32: - accessible: false + accessible: true + type: + ro_constant: 0x0 rv64: accessible: false reset-val: 0 pmpaddr59: rv32: - accessible: false + accessible: true + type: + ro_constant: 0x0 rv64: accessible: false reset-val: 0 pmpaddr60: rv32: - accessible: false + accessible: true + type: + ro_constant: 0x0 rv64: accessible: false reset-val: 0 pmpaddr61: rv32: - accessible: false + accessible: true + type: + ro_constant: 0x0 rv64: accessible: false reset-val: 0 pmpaddr62: rv32: - accessible: false + accessible: true + type: + ro_constant: 0x0 rv64: accessible: false reset-val: 0 pmpaddr63: rv32: - accessible: false + accessible: true + type: + ro_constant: 0x0 rv64: accessible: false reset-val: 0 diff --git a/config/riscv-config/cv32a65x/spec/platform_spec.yaml b/config/riscv-config/cv32a65x/spec/platform_spec.yaml index fd27af227c..36bee7f1d5 100644 --- a/config/riscv-config/cv32a65x/spec/platform_spec.yaml +++ b/config/riscv-config/cv32a65x/spec/platform_spec.yaml @@ -19,5 +19,19 @@ nmi: reset: label: reset_vector mtime: - implemented: True + implemented: true address: 0x20000 +memory_map: + - memory_region: + name: bootrom + base_addr: 0x10000 + size: 0x10000 + description: System boot ROM + attributes: + read_only: true + cached: false + - memory_region: + name: dram + base_addr: 0x80000000 + size: 0x40000000 + description: System (D)RAM diff --git a/core/Flist.cva6 b/core/Flist.cva6 index aa885261a1..b54cc3cb1b 100644 --- a/core/Flist.cva6 +++ b/core/Flist.cva6 @@ -71,11 +71,14 @@ ${CVA6_REPO_DIR}/core/include/instr_tracer_pkg.sv ${CVA6_REPO_DIR}/core/include/build_config_pkg.sv //CVXIF -${CVA6_REPO_DIR}/core/include/cvxif_pkg.sv +${CVA6_REPO_DIR}/core/cvxif_compressed_if_driver.sv +${CVA6_REPO_DIR}/core/cvxif_issue_register_commit_if_driver.sv ${CVA6_REPO_DIR}/core/cvxif_example/include/cvxif_instr_pkg.sv ${CVA6_REPO_DIR}/core/cvxif_fu.sv ${CVA6_REPO_DIR}/core/cvxif_example/cvxif_example_coprocessor.sv 
${CVA6_REPO_DIR}/core/cvxif_example/instr_decoder.sv +${CVA6_REPO_DIR}/core/cvxif_example/compressed_instr_decoder.sv +${CVA6_REPO_DIR}/core/cvxif_example/copro_alu.sv // Common Cells ${CVA6_REPO_DIR}/vendor/pulp-platform/common_cells/src/cf_math_pkg.sv @@ -126,7 +129,6 @@ ${CVA6_REPO_DIR}/core/ariane_regfile_ff.sv ${CVA6_REPO_DIR}/core/ariane_regfile_fpga.sv // NOTE: scoreboard.sv modified for DSIM (unchanged for other simulators) ${CVA6_REPO_DIR}/core/scoreboard.sv -${CVA6_REPO_DIR}/core/round_interval.sv ${CVA6_REPO_DIR}/core/store_buffer.sv ${CVA6_REPO_DIR}/core/amo_buffer.sv ${CVA6_REPO_DIR}/core/store_unit.sv @@ -161,8 +163,6 @@ ${CVA6_REPO_DIR}/core/cache_subsystem/cva6_icache_axi_wrapper.sv ${CVA6_REPO_DIR}/core/cache_subsystem/std_cache_subsystem.sv ${CVA6_REPO_DIR}/core/cache_subsystem/std_nbdcache.sv -F ${HPDCACHE_DIR}/rtl/hpdcache.Flist -${HPDCACHE_DIR}/rtl/src/utils/hpdcache_mem_req_read_arbiter.sv -${HPDCACHE_DIR}/rtl/src/utils/hpdcache_mem_req_write_arbiter.sv ${HPDCACHE_DIR}/rtl/src/utils/hpdcache_mem_resp_demux.sv ${HPDCACHE_DIR}/rtl/src/utils/hpdcache_mem_to_axi_read.sv ${HPDCACHE_DIR}/rtl/src/utils/hpdcache_mem_to_axi_write.sv diff --git a/core/Flist.cva6_gate b/core/Flist.cva6_gate index 4b743fa19c..ecc3f0b405 100644 --- a/core/Flist.cva6_gate +++ b/core/Flist.cva6_gate @@ -27,12 +27,11 @@ ${CVA6_REPO_DIR}/core/include/instr_tracer_pkg.sv ${CVA6_REPO_DIR}/core/include/build_config_pkg.sv //CVXIF -${CVA6_REPO_DIR}/core/include/cvxif_pkg.sv ${CVA6_REPO_DIR}/core/cvxif_example/include/cvxif_instr_pkg.sv -${CVA6_REPO_DIR}/core/cvxif_fu.sv ${CVA6_REPO_DIR}/core/cvxif_example/cvxif_example_coprocessor.sv ${CVA6_REPO_DIR}/core/cvxif_example/instr_decoder.sv -${CVA6_REPO_DIR}/core/cva6_fifo_v3.sv +${CVA6_REPO_DIR}/core/cvxif_example/compressed_instr_decoder.sv +${CVA6_REPO_DIR}/core/cvxif_example/copro_alu.sv // Common Cells diff --git a/core/acc_dispatcher.sv b/core/acc_dispatcher.sv index 4784564c84..d00e7eb5f0 100644 --- a/core/acc_dispatcher.sv +++ 
b/core/acc_dispatcher.sv @@ -42,7 +42,7 @@ module acc_dispatcher logic resp_valid; logic [CVA6Cfg.XLEN-1:0] result; logic [CVA6Cfg.TRANS_ID_BITS-1:0] trans_id; - logic error; + exception_t exception; // Metadata logic store_pending; logic store_complete; @@ -65,8 +65,8 @@ module acc_dispatcher // Interface with the CSRs input priv_lvl_t ld_st_priv_lvl_i, input logic sum_i, - input pmpcfg_t [CVA6Cfg.NrPMPEntries:0] pmpcfg_i, - input logic [CVA6Cfg.NrPMPEntries:0][CVA6Cfg.PLEN-3:0] pmpaddr_i, + input pmpcfg_t [CVA6Cfg.NrPMPEntries-1:0] pmpcfg_i, + input logic [CVA6Cfg.NrPMPEntries-1:0][CVA6Cfg.PLEN-3:0] pmpaddr_i, input logic [2:0] fcsr_frm_i, output logic dirty_v_state_o, // Interface with the issue stage @@ -285,7 +285,7 @@ module acc_dispatcher }; // Wait until the instruction is no longer speculative. acc_req_valid = insn_ready_q[acc_insn_queue_o.trans_id] || - (acc_commit && insn_pending_q[acc_commit_trans_id]); + (acc_commit && insn_pending_q[acc_commit_trans_id] && !flush_unissued_instr_i); acc_insn_queue_pop = acc_req_valid && acc_req_ready; end end @@ -297,30 +297,23 @@ module acc_dispatcher logic acc_ld_disp; logic acc_st_disp; + assign acc_trans_id_o = acc_resp_i.trans_id; + assign acc_result_o = acc_resp_i.result; + assign acc_valid_o = acc_resp_i.resp_valid; + assign acc_exception_o = acc_resp_i.exception; // Unpack the accelerator response - assign acc_trans_id_o = acc_resp_i.trans_id; - assign acc_result_o = acc_resp_i.result; - assign acc_valid_o = acc_resp_i.resp_valid; - assign acc_exception_o = '{ - cause: riscv::ILLEGAL_INSTR, - tval : '0, - tval2 : '0, - tinst : '0, - gva : '0, - valid: acc_resp_i.error - }; - assign acc_fflags_valid_o = acc_resp_i.fflags_valid; - assign acc_fflags_o = acc_resp_i.fflags; + assign acc_fflags_valid_o = acc_resp_i.fflags_valid; + assign acc_fflags_o = acc_resp_i.fflags; // Always ready to receive responses assign acc_req_o.resp_ready = 1'b1; // Signal dispatched load/store to issue stage - assign acc_ld_disp = 
acc_req_valid && (acc_insn_queue_o.operation == ACCEL_OP_LOAD); - assign acc_st_disp = acc_req_valid && (acc_insn_queue_o.operation == ACCEL_OP_STORE); + assign acc_ld_disp = acc_req_valid && (acc_insn_queue_o.operation == ACCEL_OP_LOAD); + assign acc_st_disp = acc_req_valid && (acc_insn_queue_o.operation == ACCEL_OP_STORE); // Cache invalidation - assign inval_valid_o = acc_resp_i.inval_valid; - assign inval_addr_o = acc_resp_i.inval_addr; + assign inval_valid_o = acc_resp_i.inval_valid; + assign inval_addr_o = acc_resp_i.inval_addr; /************************** * Accelerator commit * diff --git a/core/alu.sv b/core/alu.sv index 6e0f12ca1a..ce19579959 100644 --- a/core/alu.sv +++ b/core/alu.sv @@ -22,6 +22,7 @@ module alu import ariane_pkg::*; #( parameter config_pkg::cva6_cfg_t CVA6Cfg = config_pkg::cva6_cfg_empty, + parameter bit HasBranch = 1'b1, parameter type fu_data_t = logic ) ( // Subsystem Clock - SUBSYSTEM @@ -68,15 +69,7 @@ module alu logic [CVA6Cfg.XLEN-1:0] adder_result; logic [CVA6Cfg.XLEN-1:0] operand_a_bitmanip, bit_indx; - always_comb begin - adder_op_b_negate = 1'b0; - - unique case (fu_data_i.operation) - // ADDER OPS - EQ, NE, SUB, SUBW, ANDN, ORN, XNOR: adder_op_b_negate = 1'b1; - default: ; - endcase - end + assign adder_op_b_negate = fu_data_i.operation inside {EQ, NE, SUB, SUBW, ANDN, ORN, XNOR}; always_comb begin operand_a_bitmanip = fu_data_i.operand_a; @@ -110,21 +103,24 @@ module alu assign adder_in_b = operand_b_neg; // actual adder - assign adder_result_ext_o = $unsigned(adder_in_a) + $unsigned(adder_in_b); + assign adder_result_ext_o = adder_in_a + adder_in_b; assign adder_result = adder_result_ext_o[CVA6Cfg.XLEN:1]; assign adder_z_flag = ~|adder_result; // get the right branch comparison result - always_comb begin : branch_resolve - // set comparison by default - alu_branch_res_o = 1'b1; - case (fu_data_i.operation) - EQ: alu_branch_res_o = adder_z_flag; - NE: alu_branch_res_o = ~adder_z_flag; - LTS, LTU: alu_branch_res_o = less; - 
GES, GEU: alu_branch_res_o = ~less; - default: alu_branch_res_o = 1'b1; - endcase + if (HasBranch) begin + always_comb begin : branch_resolve + // set comparison by default + case (fu_data_i.operation) + EQ: alu_branch_res_o = adder_z_flag; + NE: alu_branch_res_o = ~adder_z_flag; + LTS, LTU: alu_branch_res_o = less; + GES, GEU: alu_branch_res_o = ~less; + default: alu_branch_res_o = 1'b1; + endcase + end + end else begin + assign alu_branch_res_o = 1'b0; end // --------- @@ -300,10 +296,10 @@ module alu if (CVA6Cfg.RVB) begin // Index for Bitwise Rotation bit_indx = 1 << (fu_data_i.operand_b & (CVA6Cfg.XLEN - 1)); - // rolw, roriw, rorw - rolw = ({{CVA6Cfg.XLEN-32{1'b0}},fu_data_i.operand_a[31:0]} << fu_data_i.operand_b[4:0]) | ({{CVA6Cfg.XLEN-32{1'b0}},fu_data_i.operand_a[31:0]} >> (CVA6Cfg.XLEN-32-fu_data_i.operand_b[4:0])); - rorw = ({{CVA6Cfg.XLEN-32{1'b0}},fu_data_i.operand_a[31:0]} >> fu_data_i.operand_b[4:0]) | ({{CVA6Cfg.XLEN-32{1'b0}},fu_data_i.operand_a[31:0]} << (CVA6Cfg.XLEN-32-fu_data_i.operand_b[4:0])); if (CVA6Cfg.IS_XLEN64) begin + // rolw, roriw, rorw + rolw = ({{CVA6Cfg.XLEN-32{1'b0}},fu_data_i.operand_a[31:0]} << fu_data_i.operand_b[4:0]) | ({{CVA6Cfg.XLEN-32{1'b0}},fu_data_i.operand_a[31:0]} >> (CVA6Cfg.XLEN-32-fu_data_i.operand_b[4:0])); + rorw = ({{CVA6Cfg.XLEN-32{1'b0}},fu_data_i.operand_a[31:0]} >> fu_data_i.operand_b[4:0]) | ({{CVA6Cfg.XLEN-32{1'b0}},fu_data_i.operand_a[31:0]} << (CVA6Cfg.XLEN-32-fu_data_i.operand_b[4:0])); unique case (fu_data_i.operation) CLZW, CTZW: result_o = (lz_tz_wempty) ? 
32 : {{CVA6Cfg.XLEN - 5{1'b0}}, lz_tz_wcount}; // change diff --git a/core/branch_unit.sv b/core/branch_unit.sv index 4e74497d72..0688836639 100644 --- a/core/branch_unit.sv +++ b/core/branch_unit.sv @@ -108,8 +108,6 @@ module branch_unit #( always_comb begin : exception_handling // Do a jump if it is either unconditional jump (JAL | JALR) or `taken` conditional jump - jump_taken = !(ariane_pkg::op_is_branch(fu_data_i.operation)) || - ((ariane_pkg::op_is_branch(fu_data_i.operation)) && branch_comp_res_i); branch_exception_o.cause = riscv::INSTR_ADDR_MISALIGNED; branch_exception_o.valid = 1'b0; if (CVA6Cfg.TvalEn) @@ -121,6 +119,8 @@ module branch_unit #( // Only throw instruction address misaligned exception if this is indeed a `taken` conditional branch or // an unconditional jump if (!CVA6Cfg.RVC) begin + jump_taken = !(ariane_pkg::op_is_branch(fu_data_i.operation)) || + ((ariane_pkg::op_is_branch(fu_data_i.operation)) && branch_comp_res_i); if (branch_valid_i && (target_address[0] || target_address[1]) && jump_taken) begin branch_exception_o.valid = 1'b1; end diff --git a/core/cache_subsystem/axi_adapter.sv b/core/cache_subsystem/axi_adapter.sv index 47e12c9e61..3ee65421b7 100644 --- a/core/cache_subsystem/axi_adapter.sv +++ b/core/cache_subsystem/axi_adapter.sv @@ -201,7 +201,7 @@ module axi_adapter #( end else begin // bursts of AMOs unsupported assert (amo_i == ariane_pkg::AMO_NONE) - else $fatal("Bursts of atomic operations are not supported"); + else $fatal(1, "Bursts of atomic operations are not supported"); axi_req_o.aw.len = BURST_SIZE[7:0]; // number of bursts to do axi_req_o.w.data = wdata_i[0]; @@ -232,7 +232,7 @@ module axi_adapter #( gnt_o = axi_resp_i.ar_ready; if (type_i != ariane_pkg::SINGLE_REQ) begin assert (amo_i == ariane_pkg::AMO_NONE) - else $fatal("Bursts of atomic operations are not supported"); + else $fatal(1, "Bursts of atomic operations are not supported"); axi_req_o.ar.len = BURST_SIZE[7:0]; cnt_d = BURST_SIZE[ADDR_INDEX-1:0]; diff 
--git a/core/cache_subsystem/cache_ctrl.sv b/core/cache_subsystem/cache_ctrl.sv index 445c4927d6..8091e6a823 100644 --- a/core/cache_subsystem/cache_ctrl.sv +++ b/core/cache_subsystem/cache_ctrl.sv @@ -117,7 +117,8 @@ module cache_ctrl // cache-line offset -> multiple of XLEN cl_offset = mem_req_q.index[CVA6Cfg.DCACHE_OFFSET_WIDTH-1:$clog2(CVA6Cfg.XLEN/8)] << $clog2(CVA6Cfg.XLEN); // shift by log2(XLEN) to the left - axi_offset = '0; + // XLEN offset within AXI request + axi_offset = (mem_req_q.index >> $clog2(CVA6Cfg.XLEN / 8)) << $clog2(CVA6Cfg.XLEN); // default assignments state_d = state_q; mem_req_d = mem_req_q; @@ -138,11 +139,6 @@ module cache_ctrl mem_req_d.killed |= req_port_i.kill_req; - if (CVA6Cfg.XLEN == 32) begin - axi_offset = mem_req_q.index[$clog2(CVA6Cfg.AxiDataWidth/8)-1:$clog2(CVA6Cfg.XLEN/8)] << - $clog2(CVA6Cfg.XLEN); - end - case (state_q) IDLE: begin diff --git a/core/cache_subsystem/cva6_hpdcache_if_adapter.sv b/core/cache_subsystem/cva6_hpdcache_if_adapter.sv index 1b40134377..eb6f217ec8 100644 --- a/core/cache_subsystem/cva6_hpdcache_if_adapter.sv +++ b/core/cache_subsystem/cva6_hpdcache_if_adapter.sv @@ -78,29 +78,32 @@ module cva6_hpdcache_if_adapter ); // Request forwarding - assign hpdcache_req_valid_o = cva6_req_i.data_req, - hpdcache_req_o.addr_offset = cva6_req_i.address_index, - hpdcache_req_o.wdata = '0, - hpdcache_req_o.op = hpdcache_pkg::HPDCACHE_REQ_LOAD, - hpdcache_req_o.be = cva6_req_i.data_be, - hpdcache_req_o.size = cva6_req_i.data_size, - hpdcache_req_o.sid = hpdcache_req_sid_i, - hpdcache_req_o.tid = cva6_req_i.data_id, - hpdcache_req_o.need_rsp = 1'b1, - hpdcache_req_o.phys_indexed = 1'b0, - hpdcache_req_o.addr_tag = '0, // unused on virtually indexed request - hpdcache_req_o.pma = '0; // unused on virtually indexed request - - assign hpdcache_req_abort_o = cva6_req_i.kill_req, - hpdcache_req_tag_o = cva6_req_i.address_tag, - hpdcache_req_pma_o.uncacheable = hpdcache_req_is_uncacheable, - hpdcache_req_pma_o.io = 1'b0; + 
assign hpdcache_req_valid_o = cva6_req_i.data_req; + assign hpdcache_req_o.addr_offset = cva6_req_i.address_index; + assign hpdcache_req_o.wdata = '0; + assign hpdcache_req_o.op = hpdcache_pkg::HPDCACHE_REQ_LOAD; + assign hpdcache_req_o.be = cva6_req_i.data_be; + assign hpdcache_req_o.size = cva6_req_i.data_size; + assign hpdcache_req_o.sid = hpdcache_req_sid_i; + assign hpdcache_req_o.tid = cva6_req_i.data_id; + assign hpdcache_req_o.need_rsp = 1'b1; + assign hpdcache_req_o.phys_indexed = 1'b0; + assign hpdcache_req_o.addr_tag = '0; // unused on virtually indexed request + assign hpdcache_req_o.pma.uncacheable = 1'b0; + assign hpdcache_req_o.pma.io = 1'b0; + assign hpdcache_req_o.pma.wr_policy_hint = hpdcache_pkg::HPDCACHE_WR_POLICY_AUTO; + + assign hpdcache_req_abort_o = cva6_req_i.kill_req; + assign hpdcache_req_tag_o = cva6_req_i.address_tag; + assign hpdcache_req_pma_o.uncacheable = hpdcache_req_is_uncacheable; + assign hpdcache_req_pma_o.io = 1'b0; + assign hpdcache_req_pma_o.wr_policy_hint = hpdcache_pkg::HPDCACHE_WR_POLICY_AUTO; // Response forwarding - assign cva6_req_o.data_rvalid = hpdcache_rsp_valid_i, - cva6_req_o.data_rdata = hpdcache_rsp_i.rdata, - cva6_req_o.data_rid = hpdcache_rsp_i.tid, - cva6_req_o.data_gnt = hpdcache_req_ready_i; + assign cva6_req_o.data_rvalid = hpdcache_rsp_valid_i; + assign cva6_req_o.data_rdata = hpdcache_rsp_i.rdata; + assign cva6_req_o.data_rid = hpdcache_rsp_i.tid; + assign cva6_req_o.data_gnt = hpdcache_req_ready_i; end // }}} // {{{ @@ -176,9 +179,12 @@ module cva6_hpdcache_if_adapter assign hpdcache_req_o.addr_tag = forward_amo ? 
amo_tag : cva6_req_i.address_tag; assign hpdcache_req_o.pma.uncacheable = hpdcache_req_is_uncacheable; assign hpdcache_req_o.pma.io = 1'b0; + assign hpdcache_req_o.pma.wr_policy_hint = hpdcache_pkg::HPDCACHE_WR_POLICY_AUTO; assign hpdcache_req_abort_o = 1'b0; // unused on physically indexed requests assign hpdcache_req_tag_o = '0; // unused on physically indexed requests - assign hpdcache_req_pma_o = '0; // unused on physically indexed requests + assign hpdcache_req_pma_o.uncacheable = 1'b0; + assign hpdcache_req_pma_o.io = 1'b0; + assign hpdcache_req_pma_o.wr_policy_hint = hpdcache_pkg::HPDCACHE_WR_POLICY_AUTO; // }}} // Response forwarding diff --git a/core/cache_subsystem/cva6_hpdcache_subsystem.sv b/core/cache_subsystem/cva6_hpdcache_subsystem.sv index 80a00a5799..9cc5234385 100644 --- a/core/cache_subsystem/cva6_hpdcache_subsystem.sv +++ b/core/cache_subsystem/cva6_hpdcache_subsystem.sv @@ -216,11 +216,14 @@ module cva6_hpdcache_subsystem wbufDataEntries: CVA6Cfg.WtDcacheWbufDepth, wbufWords: 1, wbufTimecntWidth: 3, - wbufSendFeedThrough: 1'b0, rtabEntries: 4, + flushEntries: 0, + flushFifoDepth: 0, memAddrWidth: CVA6Cfg.AxiAddrWidth, memIdWidth: CVA6Cfg.MEM_TID_WIDTH, - memDataWidth: CVA6Cfg.AxiDataWidth + memDataWidth: CVA6Cfg.AxiDataWidth, + wtEn: 1'b1, + wbEn: 1'b0 }; localparam hpdcache_pkg::hpdcache_cfg_t HPDcacheCfg = hpdcache_pkg::hpdcacheBuildConfig( @@ -245,45 +248,25 @@ module cva6_hpdcache_subsystem typedef logic [HPDcacheCfg.u.wbufTimecntWidth-1:0] hpdcache_wbuf_timecnt_t; - logic dcache_miss_ready; - logic dcache_miss_valid; - hpdcache_mem_req_t dcache_miss; + logic dcache_read_ready; + logic dcache_read_valid; + hpdcache_mem_req_t dcache_read; - logic dcache_miss_resp_ready; - logic dcache_miss_resp_valid; - hpdcache_mem_resp_r_t dcache_miss_resp; + logic dcache_read_resp_ready; + logic dcache_read_resp_valid; + hpdcache_mem_resp_r_t dcache_read_resp; - logic dcache_wbuf_ready; - logic dcache_wbuf_valid; - hpdcache_mem_req_t dcache_wbuf; + logic 
dcache_write_ready; + logic dcache_write_valid; + hpdcache_mem_req_t dcache_write; - logic dcache_wbuf_data_ready; - logic dcache_wbuf_data_valid; - hpdcache_mem_req_w_t dcache_wbuf_data; + logic dcache_write_data_ready; + logic dcache_write_data_valid; + hpdcache_mem_req_w_t dcache_write_data; - logic dcache_wbuf_resp_ready; - logic dcache_wbuf_resp_valid; - hpdcache_mem_resp_w_t dcache_wbuf_resp; - - logic dcache_uc_read_ready; - logic dcache_uc_read_valid; - hpdcache_mem_req_t dcache_uc_read; - - logic dcache_uc_read_resp_ready; - logic dcache_uc_read_resp_valid; - hpdcache_mem_resp_r_t dcache_uc_read_resp; - - logic dcache_uc_write_ready; - logic dcache_uc_write_valid; - hpdcache_mem_req_t dcache_uc_write; - - logic dcache_uc_write_data_ready; - logic dcache_uc_write_data_valid; - hpdcache_mem_req_w_t dcache_uc_write_data; - - logic dcache_uc_write_resp_ready; - logic dcache_uc_write_resp_valid; - hpdcache_mem_resp_w_t dcache_uc_write_resp; + logic dcache_write_resp_ready; + logic dcache_write_resp_valid; + hpdcache_mem_resp_w_t dcache_write_resp; cva6_hpdcache_wrapper #( .CVA6Cfg(CVA6Cfg), @@ -339,46 +322,25 @@ module cva6_hpdcache_subsystem .hwpf_throttle_o(hwpf_throttle_o), .hwpf_status_o(hwpf_status_o), - .dcache_mem_req_miss_read_ready_i(dcache_miss_ready), - .dcache_mem_req_miss_read_valid_o(dcache_miss_valid), - .dcache_mem_req_miss_read_o(dcache_miss), - - .dcache_mem_resp_miss_read_ready_o(dcache_miss_resp_ready), - .dcache_mem_resp_miss_read_valid_i(dcache_miss_resp_valid), - .dcache_mem_resp_miss_read_i(dcache_miss_resp), - - .dcache_mem_req_wbuf_write_ready_i(dcache_wbuf_ready), - .dcache_mem_req_wbuf_write_valid_o(dcache_wbuf_valid), - .dcache_mem_req_wbuf_write_o(dcache_wbuf), - - .dcache_mem_req_wbuf_write_data_ready_i(dcache_wbuf_data_ready), - .dcache_mem_req_wbuf_write_data_valid_o(dcache_wbuf_data_valid), - .dcache_mem_req_wbuf_write_data_o(dcache_wbuf_data), + .dcache_mem_req_read_ready_i(dcache_read_ready), + 
.dcache_mem_req_read_valid_o(dcache_read_valid), + .dcache_mem_req_read_o(dcache_read), - .dcache_mem_resp_wbuf_write_ready_o(dcache_wbuf_resp_ready), - .dcache_mem_resp_wbuf_write_valid_i(dcache_wbuf_resp_valid), - .dcache_mem_resp_wbuf_write_i(dcache_wbuf_resp), + .dcache_mem_resp_read_ready_o(dcache_read_resp_ready), + .dcache_mem_resp_read_valid_i(dcache_read_resp_valid), + .dcache_mem_resp_read_i(dcache_read_resp), - .dcache_mem_req_uc_read_ready_i(dcache_uc_read_ready), - .dcache_mem_req_uc_read_valid_o(dcache_uc_read_valid), - .dcache_mem_req_uc_read_o(dcache_uc_read), + .dcache_mem_req_write_ready_i(dcache_write_ready), + .dcache_mem_req_write_valid_o(dcache_write_valid), + .dcache_mem_req_write_o(dcache_write), - .dcache_mem_resp_uc_read_ready_o(dcache_uc_read_resp_ready), - .dcache_mem_resp_uc_read_valid_i(dcache_uc_read_resp_valid), - .dcache_mem_resp_uc_read_i(dcache_uc_read_resp), - - .dcache_mem_req_uc_write_ready_i(dcache_uc_write_ready), - .dcache_mem_req_uc_write_valid_o(dcache_uc_write_valid), - .dcache_mem_req_uc_write_o(dcache_uc_write), - - .dcache_mem_req_uc_write_data_ready_i(dcache_uc_write_data_ready), - .dcache_mem_req_uc_write_data_valid_o(dcache_uc_write_data_valid), - .dcache_mem_req_uc_write_data_o(dcache_uc_write_data), - - .dcache_mem_resp_uc_write_ready_o(dcache_uc_write_resp_ready), - .dcache_mem_resp_uc_write_valid_i(dcache_uc_write_resp_valid), - .dcache_mem_resp_uc_write_i(dcache_uc_write_resp) + .dcache_mem_req_write_data_ready_i(dcache_write_data_ready), + .dcache_mem_req_write_data_valid_o(dcache_write_data_valid), + .dcache_mem_req_write_data_o(dcache_write_data), + .dcache_mem_resp_write_ready_o(dcache_write_resp_ready), + .dcache_mem_resp_write_valid_i(dcache_write_resp_valid), + .dcache_mem_resp_write_i(dcache_write_resp) ); // AXI arbiter instantiation @@ -416,47 +378,25 @@ module cva6_hpdcache_subsystem .icache_miss_resp_valid_o(icache_miss_resp_valid), .icache_miss_resp_o (icache_miss_resp), - 
.dcache_miss_ready_o(dcache_miss_ready), - .dcache_miss_valid_i(dcache_miss_valid), - .dcache_miss_i (dcache_miss), - - .dcache_miss_resp_ready_i(dcache_miss_resp_ready), - .dcache_miss_resp_valid_o(dcache_miss_resp_valid), - .dcache_miss_resp_o (dcache_miss_resp), - - .dcache_wbuf_ready_o(dcache_wbuf_ready), - .dcache_wbuf_valid_i(dcache_wbuf_valid), - .dcache_wbuf_i (dcache_wbuf), - - .dcache_wbuf_data_ready_o(dcache_wbuf_data_ready), - .dcache_wbuf_data_valid_i(dcache_wbuf_data_valid), - .dcache_wbuf_data_i (dcache_wbuf_data), - - .dcache_wbuf_resp_ready_i(dcache_wbuf_resp_ready), - .dcache_wbuf_resp_valid_o(dcache_wbuf_resp_valid), - .dcache_wbuf_resp_o (dcache_wbuf_resp), - - .dcache_uc_read_ready_o(dcache_uc_read_ready), - .dcache_uc_read_valid_i(dcache_uc_read_valid), - .dcache_uc_read_i (dcache_uc_read), - .dcache_uc_read_id_i ('1), + .dcache_read_ready_o(dcache_read_ready), + .dcache_read_valid_i(dcache_read_valid), + .dcache_read_i (dcache_read), - .dcache_uc_read_resp_ready_i(dcache_uc_read_resp_ready), - .dcache_uc_read_resp_valid_o(dcache_uc_read_resp_valid), - .dcache_uc_read_resp_o (dcache_uc_read_resp), + .dcache_read_resp_ready_i(dcache_read_resp_ready), + .dcache_read_resp_valid_o(dcache_read_resp_valid), + .dcache_read_resp_o (dcache_read_resp), - .dcache_uc_write_ready_o(dcache_uc_write_ready), - .dcache_uc_write_valid_i(dcache_uc_write_valid), - .dcache_uc_write_i (dcache_uc_write), - .dcache_uc_write_id_i ('1), + .dcache_write_ready_o(dcache_write_ready), + .dcache_write_valid_i(dcache_write_valid), + .dcache_write_i (dcache_write), - .dcache_uc_write_data_ready_o(dcache_uc_write_data_ready), - .dcache_uc_write_data_valid_i(dcache_uc_write_data_valid), - .dcache_uc_write_data_i (dcache_uc_write_data), + .dcache_write_data_ready_o(dcache_write_data_ready), + .dcache_write_data_valid_i(dcache_write_data_valid), + .dcache_write_data_i (dcache_write_data), - .dcache_uc_write_resp_ready_i(dcache_uc_write_resp_ready), - 
.dcache_uc_write_resp_valid_o(dcache_uc_write_resp_valid), - .dcache_uc_write_resp_o (dcache_uc_write_resp), + .dcache_write_resp_ready_i(dcache_write_resp_ready), + .dcache_write_resp_valid_o(dcache_write_resp_valid), + .dcache_write_resp_o (dcache_write_resp), .axi_req_o (noc_req_o), .axi_resp_i(noc_resp_i) diff --git a/core/cache_subsystem/cva6_hpdcache_subsystem_axi_arbiter.sv b/core/cache_subsystem/cva6_hpdcache_subsystem_axi_arbiter.sv index 48ee3f5d9c..901175f79c 100644 --- a/core/cache_subsystem/cva6_hpdcache_subsystem_axi_arbiter.sv +++ b/core/cache_subsystem/cva6_hpdcache_subsystem_axi_arbiter.sv @@ -58,50 +58,27 @@ module cva6_hpdcache_subsystem_axi_arbiter // Interfaces from/to D$ // {{{ - output logic dcache_miss_ready_o, - input logic dcache_miss_valid_i, - input hpdcache_mem_req_t dcache_miss_i, - - input logic dcache_miss_resp_ready_i, - output logic dcache_miss_resp_valid_o, - output hpdcache_mem_resp_r_t dcache_miss_resp_o, - - // Write-buffer write interface - output logic dcache_wbuf_ready_o, - input logic dcache_wbuf_valid_i, - input hpdcache_mem_req_t dcache_wbuf_i, - - output logic dcache_wbuf_data_ready_o, - input logic dcache_wbuf_data_valid_i, - input hpdcache_mem_req_w_t dcache_wbuf_data_i, - - input logic dcache_wbuf_resp_ready_i, - output logic dcache_wbuf_resp_valid_o, - output hpdcache_mem_resp_w_t dcache_wbuf_resp_o, - - // Uncached read interface - output logic dcache_uc_read_ready_o, - input logic dcache_uc_read_valid_i, - input hpdcache_mem_req_t dcache_uc_read_i, - input hpdcache_mem_id_t dcache_uc_read_id_i, - - input logic dcache_uc_read_resp_ready_i, - output logic dcache_uc_read_resp_valid_o, - output hpdcache_mem_resp_r_t dcache_uc_read_resp_o, - - // Uncached write interface - output logic dcache_uc_write_ready_o, - input logic dcache_uc_write_valid_i, - input hpdcache_mem_req_t dcache_uc_write_i, - input hpdcache_mem_id_t dcache_uc_write_id_i, - - output logic dcache_uc_write_data_ready_o, - input logic 
dcache_uc_write_data_valid_i, - input hpdcache_mem_req_w_t dcache_uc_write_data_i, - - input logic dcache_uc_write_resp_ready_i, - output logic dcache_uc_write_resp_valid_o, - output hpdcache_mem_resp_w_t dcache_uc_write_resp_o, + // Read interface + output logic dcache_read_ready_o, + input logic dcache_read_valid_i, + input hpdcache_mem_req_t dcache_read_i, + + input logic dcache_read_resp_ready_i, + output logic dcache_read_resp_valid_o, + output hpdcache_mem_resp_r_t dcache_read_resp_o, + + // Write interface + output logic dcache_write_ready_o, + input logic dcache_write_valid_i, + input hpdcache_mem_req_t dcache_write_i, + + output logic dcache_write_data_ready_o, + input logic dcache_write_data_valid_i, + input hpdcache_mem_req_w_t dcache_write_data_i, + + input logic dcache_write_resp_ready_i, + output logic dcache_write_resp_valid_o, + output hpdcache_mem_resp_w_t dcache_write_resp_o, // }}} // AXI port to upstream memory/peripherals @@ -166,13 +143,13 @@ module cva6_hpdcache_subsystem_axi_arbiter assign icache_miss_req_w = icache_miss_valid_i, icache_miss_ready_o = icache_miss_req_wok; - assign icache_miss_req_wdata.mem_req_addr = icache_miss_i.paddr, - icache_miss_req_wdata.mem_req_len = icache_miss_i.nc ? 0 : ICACHE_MEM_REQ_CL_LEN - 1, - icache_miss_req_wdata.mem_req_size = icache_miss_i.nc ? ICACHE_WORD_SIZE : ICACHE_MEM_REQ_CL_SIZE, - icache_miss_req_wdata.mem_req_id = icache_miss_i.tid, - icache_miss_req_wdata.mem_req_command = hpdcache_pkg::HPDCACHE_MEM_READ, - icache_miss_req_wdata.mem_req_atomic = hpdcache_pkg::hpdcache_mem_atomic_e'(0), - icache_miss_req_wdata.mem_req_cacheable = ~icache_miss_i.nc; + assign icache_miss_req_wdata.mem_req_addr = icache_miss_i.paddr; + assign icache_miss_req_wdata.mem_req_len = icache_miss_i.nc ? 0 : ICACHE_MEM_REQ_CL_LEN - 1; + assign icache_miss_req_wdata.mem_req_size = icache_miss_i.nc ? 
ICACHE_WORD_SIZE : ICACHE_MEM_REQ_CL_SIZE; + assign icache_miss_req_wdata.mem_req_id = icache_miss_i.tid; + assign icache_miss_req_wdata.mem_req_command = hpdcache_pkg::HPDCACHE_MEM_READ; + assign icache_miss_req_wdata.mem_req_atomic = hpdcache_pkg::hpdcache_mem_atomic_e'(0); + assign icache_miss_req_wdata.mem_req_cacheable = ~icache_miss_i.nc; // I$ response @@ -258,12 +235,12 @@ module cva6_hpdcache_subsystem_axi_arbiter end endgenerate - assign icache_miss_resp_valid_o = icache_miss_resp_meta_rok, - icache_miss_resp_o.rtype = wt_cache_pkg::ICACHE_IFILL_ACK, - icache_miss_resp_o.user = '0, - icache_miss_resp_o.inv = '0, - icache_miss_resp_o.tid = icache_miss_resp_meta_id, - icache_miss_resp_o.data = icache_miss_rdata; + assign icache_miss_resp_valid_o = icache_miss_resp_meta_rok; + assign icache_miss_resp_o.rtype = wt_cache_pkg::ICACHE_IFILL_ACK; + assign icache_miss_resp_o.user = '0; + assign icache_miss_resp_o.inv = '0; + assign icache_miss_resp_o.tid = icache_miss_resp_meta_id; + assign icache_miss_resp_o.data = icache_miss_rdata; // consume the Icache miss on the arrival of the response. 
The request // metadata is decoded to forward the correct word in case of uncacheable @@ -273,27 +250,23 @@ module cva6_hpdcache_subsystem_axi_arbiter // Read request arbiter // {{{ - logic mem_req_read_ready [2:0]; - logic mem_req_read_valid [2:0]; - hpdcache_mem_req_t mem_req_read [2:0]; + logic [1:0] mem_req_read_ready; + logic [1:0] mem_req_read_valid; + hpdcache_mem_req_t [1:0] mem_req_read; - logic mem_req_read_ready_arb; - logic mem_req_read_valid_arb; - hpdcache_mem_req_t mem_req_read_arb; + logic mem_req_read_ready_arb; + logic mem_req_read_valid_arb; + hpdcache_mem_req_t mem_req_read_arb; - assign mem_req_read_valid[0] = icache_miss_req_rok & ~icache_miss_pending_q, - mem_req_read[0] = icache_miss_req_rdata; + assign mem_req_read_valid[0] = icache_miss_req_rok & ~icache_miss_pending_q; + assign mem_req_read[0] = icache_miss_req_rdata; - assign dcache_miss_ready_o = mem_req_read_ready[1], - mem_req_read_valid[1] = dcache_miss_valid_i, - mem_req_read[1] = dcache_miss_i; - - assign dcache_uc_read_ready_o = mem_req_read_ready[2], - mem_req_read_valid[2] = dcache_uc_read_valid_i, - mem_req_read[2] = dcache_uc_read_i; + assign dcache_read_ready_o = mem_req_read_ready[1]; + assign mem_req_read_valid[1] = dcache_read_valid_i; + assign mem_req_read[1] = dcache_read_i; hpdcache_mem_req_read_arbiter #( - .N (3), + .N (2), .hpdcache_mem_req_t(hpdcache_mem_req_t) ) i_mem_req_read_arbiter ( .clk_i, @@ -315,21 +288,20 @@ module cva6_hpdcache_subsystem_axi_arbiter logic mem_resp_read_valid; hpdcache_mem_resp_r_t mem_resp_read; - logic mem_resp_read_ready_arb[2:0]; - logic mem_resp_read_valid_arb[2:0]; - hpdcache_mem_resp_r_t mem_resp_read_arb [2:0]; + logic mem_resp_read_ready_arb[1:0]; + logic mem_resp_read_valid_arb[1:0]; + hpdcache_mem_resp_r_t mem_resp_read_arb [1:0]; mem_resp_rt_t mem_resp_read_rt; always_comb begin for (int i = 0; i < MEM_RESP_RT_DEPTH; i++) begin - mem_resp_read_rt[i] = (i == int'( icache_miss_id_i)) ? 0 : - (i == int'(dcache_uc_read_id_i)) ? 
2 : 1; + mem_resp_read_rt[i] = (i == int'(icache_miss_id_i)) ? 0 : 1; end end hpdcache_mem_resp_demux #( - .N (3), + .N (2), .resp_t (hpdcache_mem_resp_r_t), .resp_id_t(hpdcache_mem_id_t) ) i_mem_resp_read_demux ( @@ -348,124 +320,13 @@ module cva6_hpdcache_subsystem_axi_arbiter .mem_resp_rt_i(mem_resp_read_rt) ); - assign icache_miss_resp_w = mem_resp_read_valid_arb[0], - icache_miss_resp_wdata = mem_resp_read_arb[0], - mem_resp_read_ready_arb[0] = icache_miss_resp_wok; - - assign dcache_miss_resp_valid_o = mem_resp_read_valid_arb[1], - dcache_miss_resp_o = mem_resp_read_arb[1], - mem_resp_read_ready_arb[1] = dcache_miss_resp_ready_i; - - assign dcache_uc_read_resp_valid_o = mem_resp_read_valid_arb[2], - dcache_uc_read_resp_o = mem_resp_read_arb[2], - mem_resp_read_ready_arb[2] = dcache_uc_read_resp_ready_i; - // }}} - - // Write request arbiter - // {{{ - logic mem_req_write_ready [1:0]; - logic mem_req_write_valid [1:0]; - hpdcache_mem_req_t mem_req_write [1:0]; - - logic mem_req_write_data_ready [1:0]; - logic mem_req_write_data_valid [1:0]; - hpdcache_mem_req_w_t mem_req_write_data [1:0]; - - logic mem_req_write_ready_arb; - logic mem_req_write_valid_arb; - hpdcache_mem_req_t mem_req_write_arb; - - logic mem_req_write_data_ready_arb; - logic mem_req_write_data_valid_arb; - hpdcache_mem_req_w_t mem_req_write_data_arb; - - assign dcache_wbuf_ready_o = mem_req_write_ready[0], - mem_req_write_valid[0] = dcache_wbuf_valid_i, - mem_req_write[0] = dcache_wbuf_i; - - assign dcache_wbuf_data_ready_o = mem_req_write_data_ready[0], - mem_req_write_data_valid[0] = dcache_wbuf_data_valid_i, - mem_req_write_data[0] = dcache_wbuf_data_i; - - assign dcache_uc_write_ready_o = mem_req_write_ready[1], - mem_req_write_valid[1] = dcache_uc_write_valid_i, - mem_req_write[1] = dcache_uc_write_i; - - assign dcache_uc_write_data_ready_o = mem_req_write_data_ready[1], - mem_req_write_data_valid[1] = dcache_uc_write_data_valid_i, - mem_req_write_data[1] = dcache_uc_write_data_i; - - 
hpdcache_mem_req_write_arbiter #( - .N (2), - .hpdcache_mem_req_t (hpdcache_mem_req_t), - .hpdcache_mem_req_w_t(hpdcache_mem_req_w_t) - ) i_mem_req_write_arbiter ( - .clk_i, - .rst_ni, - - .mem_req_write_ready_o(mem_req_write_ready), - .mem_req_write_valid_i(mem_req_write_valid), - .mem_req_write_i (mem_req_write), - - .mem_req_write_data_ready_o(mem_req_write_data_ready), - .mem_req_write_data_valid_i(mem_req_write_data_valid), - .mem_req_write_data_i (mem_req_write_data), - - .mem_req_write_ready_i(mem_req_write_ready_arb), - .mem_req_write_valid_o(mem_req_write_valid_arb), - .mem_req_write_o (mem_req_write_arb), - - .mem_req_write_data_ready_i(mem_req_write_data_ready_arb), - .mem_req_write_data_valid_o(mem_req_write_data_valid_arb), - .mem_req_write_data_o (mem_req_write_data_arb) - ); - // }}} - - // Write response demultiplexor - // {{{ - logic mem_resp_write_ready; - logic mem_resp_write_valid; - hpdcache_mem_resp_w_t mem_resp_write; - - logic mem_resp_write_ready_arb[1:0]; - logic mem_resp_write_valid_arb[1:0]; - hpdcache_mem_resp_w_t mem_resp_write_arb [1:0]; - - mem_resp_rt_t mem_resp_write_rt; - - always_comb begin - for (int i = 0; i < MEM_RESP_RT_DEPTH; i++) begin - mem_resp_write_rt[i] = (i == int'(dcache_uc_write_id_i)) ? 
1 : 0; - end - end - - hpdcache_mem_resp_demux #( - .N (2), - .resp_t (hpdcache_mem_resp_w_t), - .resp_id_t(hpdcache_mem_id_t) - ) i_hpdcache_mem_resp_write_demux ( - .clk_i, - .rst_ni, - - .mem_resp_ready_o(mem_resp_write_ready), - .mem_resp_valid_i(mem_resp_write_valid), - .mem_resp_id_i (mem_resp_write.mem_resp_w_id), - .mem_resp_i (mem_resp_write), - - .mem_resp_ready_i(mem_resp_write_ready_arb), - .mem_resp_valid_o(mem_resp_write_valid_arb), - .mem_resp_o (mem_resp_write_arb), - - .mem_resp_rt_i(mem_resp_write_rt) - ); - - assign dcache_wbuf_resp_valid_o = mem_resp_write_valid_arb[0], - dcache_wbuf_resp_o = mem_resp_write_arb[0], - mem_resp_write_ready_arb[0] = dcache_wbuf_resp_ready_i; + assign icache_miss_resp_w = mem_resp_read_valid_arb[0]; + assign icache_miss_resp_wdata = mem_resp_read_arb[0]; + assign mem_resp_read_ready_arb[0] = icache_miss_resp_wok; - assign dcache_uc_write_resp_valid_o = mem_resp_write_valid_arb[1], - dcache_uc_write_resp_o = mem_resp_write_arb[1], - mem_resp_write_ready_arb[1] = dcache_uc_write_resp_ready_i; + assign dcache_read_resp_valid_o = mem_resp_read_valid_arb[1]; + assign dcache_read_resp_o = mem_resp_read_arb[1]; + assign mem_resp_read_ready_arb[1] = dcache_read_resp_ready_i; // }}} // I$ miss pending @@ -491,17 +352,17 @@ module cva6_hpdcache_subsystem_axi_arbiter .w_chan_t (axi_w_chan_t), .b_chan_t (axi_b_chan_t) ) i_hpdcache_mem_to_axi_write ( - .req_ready_o(mem_req_write_ready_arb), - .req_valid_i(mem_req_write_valid_arb), - .req_i (mem_req_write_arb), + .req_ready_o(dcache_write_ready_o), + .req_valid_i(dcache_write_valid_i), + .req_i (dcache_write_i), - .req_data_ready_o(mem_req_write_data_ready_arb), - .req_data_valid_i(mem_req_write_data_valid_arb), - .req_data_i (mem_req_write_data_arb), + .req_data_ready_o(dcache_write_data_ready_o), + .req_data_valid_i(dcache_write_data_valid_i), + .req_data_i (dcache_write_data_i), - .resp_ready_i(mem_resp_write_ready), - .resp_valid_o(mem_resp_write_valid), - .resp_o 
(mem_resp_write), + .resp_ready_i(dcache_write_resp_ready_i), + .resp_valid_o(dcache_write_resp_valid_o), + .resp_o (dcache_write_resp_o), .axi_aw_valid_o(axi_req_o.aw_valid), .axi_aw_o (axi_req_o.aw), @@ -546,13 +407,13 @@ module cva6_hpdcache_subsystem_axi_arbiter // pragma translate_off initial assert (CVA6Cfg.MEM_TID_WIDTH <= AxiIdWidth) - else $fatal("MEM_TID_WIDTH shall be less or equal to AxiIdWidth"); + else $fatal(1, "MEM_TID_WIDTH shall be less or equal to AxiIdWidth"); initial assert (CVA6Cfg.AxiDataWidth <= CVA6Cfg.ICACHE_LINE_WIDTH) - else $fatal("AxiDataWidth shall be less or equal to the width of a Icache line"); + else $fatal(1, "AxiDataWidth shall be less or equal to the width of a Icache line"); initial assert (CVA6Cfg.AxiDataWidth <= CVA6Cfg.DCACHE_LINE_WIDTH) - else $fatal("AxiDataWidth shall be less or equal to the width of a Dcache line"); + else $fatal(1, "AxiDataWidth shall be less or equal to the width of a Dcache line"); // pragma translate_on // }}} diff --git a/core/cache_subsystem/cva6_hpdcache_wrapper.sv b/core/cache_subsystem/cva6_hpdcache_wrapper.sv index 1e532a2b5e..daa084bbcc 100644 --- a/core/cache_subsystem/cva6_hpdcache_wrapper.sv +++ b/core/cache_subsystem/cva6_hpdcache_wrapper.sv @@ -17,7 +17,7 @@ module cva6_hpdcache_wrapper // {{{ #( parameter config_pkg::cva6_cfg_t CVA6Cfg = config_pkg::cva6_cfg_empty, - parameter hpdcache_pkg::hpdcache_cfg_t HPDcacheCfg, + parameter hpdcache_pkg::hpdcache_cfg_t HPDcacheCfg = '0, parameter type dcache_req_i_t = logic, parameter type dcache_req_o_t = logic, parameter int NumPorts = 4, @@ -68,85 +68,52 @@ module cva6_hpdcache_wrapper // Load or store miss - PERF_COUNTERS output logic dcache_miss_o, - // AMO request - EX_STAGE + // AMO request/response - EX_STAGE input ariane_pkg::amo_req_t dcache_amo_req_i, - // AMO response - EX_STAGE output ariane_pkg::amo_resp_t dcache_amo_resp_o, - // CMO interface request - TO_BE_COMPLETED + // CMO interface request/response input cmo_req_t 
dcache_cmo_req_i, - // CMO interface response - TO_BE_COMPLETED output cmo_rsp_t dcache_cmo_resp_o, - // Data cache input request ports - EX_STAGE + // Data cache input request/response ports - EX_STAGE input dcache_req_i_t [NumPorts-1:0] dcache_req_ports_i, - // Data cache output request ports - EX_STAGE output dcache_req_o_t [NumPorts-1:0] dcache_req_ports_o, - // Write buffer status to know if empty - EX_STAGE + // Write buffer status - EX_STAGE output logic wbuffer_empty_o, - // Write buffer status to know if not non idempotent - EX_STAGE output logic wbuffer_not_ni_o, // Hardware memory prefetcher configuration - // TO_BE_COMPLETED - TO_BE_COMPLETED input logic [NrHwPrefetchers-1:0] hwpf_base_set_i, - // TO_BE_COMPLETED - TO_BE_COMPLETED input logic [NrHwPrefetchers-1:0][63:0] hwpf_base_i, - // TO_BE_COMPLETED - TO_BE_COMPLETED output logic [NrHwPrefetchers-1:0][63:0] hwpf_base_o, - // TO_BE_COMPLETED - TO_BE_COMPLETED input logic [NrHwPrefetchers-1:0] hwpf_param_set_i, - // TO_BE_COMPLETED - TO_BE_COMPLETED input logic [NrHwPrefetchers-1:0][63:0] hwpf_param_i, - // TO_BE_COMPLETED - TO_BE_COMPLETED output logic [NrHwPrefetchers-1:0][63:0] hwpf_param_o, - // TO_BE_COMPLETED - TO_BE_COMPLETED input logic [NrHwPrefetchers-1:0] hwpf_throttle_set_i, - // TO_BE_COMPLETED - TO_BE_COMPLETED input logic [NrHwPrefetchers-1:0][63:0] hwpf_throttle_i, - // TO_BE_COMPLETED - TO_BE_COMPLETED output logic [NrHwPrefetchers-1:0][63:0] hwpf_throttle_o, - // TO_BE_COMPLETED - TO_BE_COMPLETED output logic [ 63:0] hwpf_status_o, - input logic dcache_mem_req_miss_read_ready_i, - output logic dcache_mem_req_miss_read_valid_o, - output hpdcache_mem_req_t dcache_mem_req_miss_read_o, + input logic dcache_mem_req_read_ready_i, + output logic dcache_mem_req_read_valid_o, + output hpdcache_mem_req_t dcache_mem_req_read_o, - output logic dcache_mem_resp_miss_read_ready_o, - input logic dcache_mem_resp_miss_read_valid_i, - input hpdcache_mem_resp_r_t dcache_mem_resp_miss_read_i, + output 
logic dcache_mem_resp_read_ready_o, + input logic dcache_mem_resp_read_valid_i, + input hpdcache_mem_resp_r_t dcache_mem_resp_read_i, - input logic dcache_mem_req_wbuf_write_ready_i, - output logic dcache_mem_req_wbuf_write_valid_o, - output hpdcache_mem_req_t dcache_mem_req_wbuf_write_o, + input logic dcache_mem_req_write_ready_i, + output logic dcache_mem_req_write_valid_o, + output hpdcache_mem_req_t dcache_mem_req_write_o, - input logic dcache_mem_req_wbuf_write_data_ready_i, - output logic dcache_mem_req_wbuf_write_data_valid_o, - output hpdcache_mem_req_w_t dcache_mem_req_wbuf_write_data_o, + input logic dcache_mem_req_write_data_ready_i, + output logic dcache_mem_req_write_data_valid_o, + output hpdcache_mem_req_w_t dcache_mem_req_write_data_o, - output logic dcache_mem_resp_wbuf_write_ready_o, - input logic dcache_mem_resp_wbuf_write_valid_i, - input hpdcache_mem_resp_w_t dcache_mem_resp_wbuf_write_i, - - input logic dcache_mem_req_uc_read_ready_i, - output logic dcache_mem_req_uc_read_valid_o, - output hpdcache_mem_req_t dcache_mem_req_uc_read_o, - - output logic dcache_mem_resp_uc_read_ready_o, - input logic dcache_mem_resp_uc_read_valid_i, - input hpdcache_mem_resp_r_t dcache_mem_resp_uc_read_i, - - input logic dcache_mem_req_uc_write_ready_i, - output logic dcache_mem_req_uc_write_valid_o, - output hpdcache_mem_req_t dcache_mem_req_uc_write_o, - - input logic dcache_mem_req_uc_write_data_ready_i, - output logic dcache_mem_req_uc_write_data_valid_o, - output hpdcache_mem_req_w_t dcache_mem_req_uc_write_data_o, - - output logic dcache_mem_resp_uc_write_ready_o, - input logic dcache_mem_resp_uc_write_valid_i, - input hpdcache_mem_resp_w_t dcache_mem_resp_uc_write_i + output logic dcache_mem_resp_write_ready_o, + input logic dcache_mem_resp_write_valid_i, + input hpdcache_mem_resp_w_t dcache_mem_resp_write_i ); + localparam int HPDCACHE_NREQUESTERS = NumPorts + 2; typedef logic [63:0] hwpf_stride_param_t; @@ -400,45 +367,25 @@ module cva6_hpdcache_wrapper 
.core_rsp_valid_o(dcache_rsp_valid), .core_rsp_o (dcache_rsp), - .mem_req_miss_read_ready_i(dcache_mem_req_miss_read_ready_i), - .mem_req_miss_read_valid_o(dcache_mem_req_miss_read_valid_o), - .mem_req_miss_read_o (dcache_mem_req_miss_read_o), - - .mem_resp_miss_read_ready_o(dcache_mem_resp_miss_read_ready_o), - .mem_resp_miss_read_valid_i(dcache_mem_resp_miss_read_valid_i), - .mem_resp_miss_read_i (dcache_mem_resp_miss_read_i), - - .mem_req_wbuf_write_ready_i(dcache_mem_req_wbuf_write_ready_i), - .mem_req_wbuf_write_valid_o(dcache_mem_req_wbuf_write_valid_o), - .mem_req_wbuf_write_o (dcache_mem_req_wbuf_write_o), - - .mem_req_wbuf_write_data_ready_i(dcache_mem_req_wbuf_write_data_ready_i), - .mem_req_wbuf_write_data_valid_o(dcache_mem_req_wbuf_write_data_valid_o), - .mem_req_wbuf_write_data_o (dcache_mem_req_wbuf_write_data_o), - - .mem_resp_wbuf_write_ready_o(dcache_mem_resp_wbuf_write_ready_o), - .mem_resp_wbuf_write_valid_i(dcache_mem_resp_wbuf_write_valid_i), - .mem_resp_wbuf_write_i (dcache_mem_resp_wbuf_write_i), - - .mem_req_uc_read_ready_i(dcache_mem_req_uc_read_ready_i), - .mem_req_uc_read_valid_o(dcache_mem_req_uc_read_valid_o), - .mem_req_uc_read_o (dcache_mem_req_uc_read_o), + .mem_req_read_ready_i(dcache_mem_req_read_ready_i), + .mem_req_read_valid_o(dcache_mem_req_read_valid_o), + .mem_req_read_o (dcache_mem_req_read_o), - .mem_resp_uc_read_ready_o(dcache_mem_resp_uc_read_ready_o), - .mem_resp_uc_read_valid_i(dcache_mem_resp_uc_read_valid_i), - .mem_resp_uc_read_i (dcache_mem_resp_uc_read_i), + .mem_resp_read_ready_o(dcache_mem_resp_read_ready_o), + .mem_resp_read_valid_i(dcache_mem_resp_read_valid_i), + .mem_resp_read_i (dcache_mem_resp_read_i), - .mem_req_uc_write_ready_i(dcache_mem_req_uc_write_ready_i), - .mem_req_uc_write_valid_o(dcache_mem_req_uc_write_valid_o), - .mem_req_uc_write_o (dcache_mem_req_uc_write_o), + .mem_req_write_ready_i(dcache_mem_req_write_ready_i), + .mem_req_write_valid_o(dcache_mem_req_write_valid_o), + .mem_req_write_o 
(dcache_mem_req_write_o), - .mem_req_uc_write_data_ready_i(dcache_mem_req_uc_write_data_ready_i), - .mem_req_uc_write_data_valid_o(dcache_mem_req_uc_write_data_valid_o), - .mem_req_uc_write_data_o (dcache_mem_req_uc_write_data_o), + .mem_req_write_data_ready_i(dcache_mem_req_write_data_ready_i), + .mem_req_write_data_valid_o(dcache_mem_req_write_data_valid_o), + .mem_req_write_data_o (dcache_mem_req_write_data_o), - .mem_resp_uc_write_ready_o(dcache_mem_resp_uc_write_ready_o), - .mem_resp_uc_write_valid_i(dcache_mem_resp_uc_write_valid_i), - .mem_resp_uc_write_i (dcache_mem_resp_uc_write_i), + .mem_resp_write_ready_o(dcache_mem_resp_write_ready_o), + .mem_resp_write_valid_i(dcache_mem_resp_write_valid_i), + .mem_resp_write_i (dcache_mem_resp_write_i), .evt_cache_write_miss_o(dcache_write_miss), .evt_cache_read_miss_o (dcache_read_miss), @@ -461,7 +408,8 @@ module cva6_hpdcache_wrapper .cfg_wbuf_inhibit_write_coalescing_i(1'b0), .cfg_prefetch_updt_plru_i (1'b1), .cfg_error_on_cacheable_amo_i (1'b0), - .cfg_rtab_single_entry_i (1'b0) + .cfg_rtab_single_entry_i (1'b0), + .cfg_default_wb_i (1'b0) ); assign dcache_miss_o = dcache_read_miss, wbuffer_not_ni_o = wbuffer_empty_o; diff --git a/core/cache_subsystem/hpdcache b/core/cache_subsystem/hpdcache index 25ffa3438c..edd501cc74 160000 --- a/core/cache_subsystem/hpdcache +++ b/core/cache_subsystem/hpdcache @@ -1 +1 @@ -Subproject commit 25ffa3438c8150fef791dd165234694a51e3c529 +Subproject commit edd501cc7424ad63d2187feacadc942650ec14af diff --git a/core/cache_subsystem/std_cache_subsystem.sv b/core/cache_subsystem/std_cache_subsystem.sv index 33c62e5cec..7adb935c27 100644 --- a/core/cache_subsystem/std_cache_subsystem.sv +++ b/core/cache_subsystem/std_cache_subsystem.sv @@ -135,6 +135,8 @@ module std_cache_subsystem logic [1:0] w_select, w_select_fifo, w_select_arbiter; logic [1:0] w_fifo_usage; logic w_fifo_empty, w_fifo_full; + logic w_fifo_push, w_fifo_pop; + logic aw_lock_q, aw_lock_d; // AR Channel @@ -191,17 
+193,26 @@ module std_cache_subsystem .flush_i (1'b0), .testmode_i(1'b0), .full_o (w_fifo_full), - .empty_o (), // leave open + .empty_o (), // leave open .usage_o (w_fifo_usage), .data_i (w_select), // a new transaction was requested and granted - .push_i (axi_req_o.aw_valid & axi_resp_i.aw_ready), + .push_i (w_fifo_push), // write ID to select the output MUX .data_o (w_select_fifo), // transaction has finished - .pop_i (axi_req_o.w_valid & axi_resp_i.w_ready & axi_req_o.w.last) + .pop_i (w_fifo_pop) ); + always_ff @(posedge clk_i or negedge rst_ni) begin : aw_lock_reg + if (~rst_ni) aw_lock_q <= 1'b0; + else aw_lock_q <= aw_lock_d; + end + + assign w_fifo_push = ~aw_lock_q & axi_req_o.aw_valid; + assign w_fifo_pop = axi_req_o.w_valid & axi_resp_i.w_ready & axi_req_o.w.last; + assign aw_lock_d = ~axi_resp_i.aw_ready & (axi_req_o.aw_valid | aw_lock_q); + // In fall-through mode, the empty_o will be low when push_i is high (on zero usage). // We do not want this here. Also, usage_o is missing the MSB, so on full fifo, usage_o is zero. 
assign w_fifo_empty = w_fifo_usage == 0 && !w_fifo_full; diff --git a/core/cache_subsystem/tag_cmp.sv b/core/cache_subsystem/tag_cmp.sv index ff7b805fdf..38d1ce1145 100644 --- a/core/cache_subsystem/tag_cmp.sv +++ b/core/cache_subsystem/tag_cmp.sv @@ -81,18 +81,19 @@ module tag_cmp #( if (req_i[i]) break; end + end + `ifndef SYNTHESIS `ifndef VERILATOR - // assert that cache only hits on one way - // this only needs to be checked one cycle after all ways have been requested - onehot : - assert property (@(posedge clk_i) disable iff (!rst_ni) &req_i |=> $onehot0(hit_way_o)) - else begin - $fatal(1, "Hit should be one-hot encoded"); - end + // assert that cache only hits on one way + // this only needs to be checked one cycle after all ways have been requested + onehot : + assert property (@(posedge clk_i) disable iff (!rst_ni) &req_i |=> $onehot0(hit_way_o)) + else begin + $fatal(1, "Hit should be one-hot encoded"); + end `endif `endif - end always_ff @(posedge clk_i or negedge rst_ni) begin if (~rst_ni) begin diff --git a/core/cache_subsystem/wt_axi_adapter.sv b/core/cache_subsystem/wt_axi_adapter.sv index 14dade2b58..6c5fe585f8 100644 --- a/core/cache_subsystem/wt_axi_adapter.sv +++ b/core/cache_subsystem/wt_axi_adapter.sv @@ -311,8 +311,9 @@ module wt_axi_adapter end cva6_fifo_v3 #( - .dtype (icache_req_t), - .DEPTH (ReqFifoDepth), + .FPGA_ALTERA(CVA6Cfg.FpgaAlteraEn), + .dtype(icache_req_t), + .DEPTH(ReqFifoDepth), .FPGA_EN(CVA6Cfg.FpgaEn) ) i_icache_data_fifo ( .clk_i (clk_i), @@ -329,8 +330,9 @@ module wt_axi_adapter ); cva6_fifo_v3 #( - .dtype (dcache_req_t), - .DEPTH (ReqFifoDepth), + .FPGA_ALTERA(CVA6Cfg.FpgaAlteraEn), + .dtype(dcache_req_t), + .DEPTH(ReqFifoDepth), .FPGA_EN(CVA6Cfg.FpgaEn) ) i_dcache_data_fifo ( .clk_i (clk_i), diff --git a/core/commit_stage.sv b/core/commit_stage.sv index 129422e62d..8984b7d626 100644 --- a/core/commit_stage.sv +++ b/core/commit_stage.sv @@ -60,10 +60,10 @@ module commit_stage output logic [CVA6Cfg.XLEN-1:0] csr_wdata_o, 
// Data to read from CSR - CSR_REGFILE input logic [CVA6Cfg.XLEN-1:0] csr_rdata_i, - // Exception or interrupt occurred in CSR stage (the same as commit) - CSR_REGFILE - input exception_t csr_exception_i, // Write the fflags CSR - CSR_REGFILE output logic csr_write_fflags_o, + // Exception or interrupt occurred in CSR stage (the same as commit) - CSR_REGFILE + input exception_t csr_exception_i, // Commit the pending store - EX_STAGE output logic commit_lsu_o, // Commit buffer of LSU is ready - EX_STAGE @@ -84,7 +84,9 @@ module commit_stage output logic flush_commit_o, // Flush TLBs and pipeline - CONTROLLER output logic sfence_vma_o, + // TO_BE_COMPLETED - CONTROLLER output logic hfence_vvma_o, + // TO_BE_COMPLETED - CONTROLLER output logic hfence_gvma_o ); @@ -103,7 +105,7 @@ module commit_stage // ); for (genvar i = 0; i < CVA6Cfg.NrCommitPorts; i++) begin : gen_waddr - assign waddr_o[i] = commit_instr_i[i].rd[4:0]; + assign waddr_o[i] = commit_instr_i[i].rd; end assign pc_o = commit_instr_i[0].pc; @@ -314,14 +316,13 @@ module commit_stage && !halt_i && !(commit_instr_i[0].fu inside {CSR}) && !flush_dcache_i - && !instr_0_is_amo + && !(CVA6Cfg.RVA && instr_0_is_amo) && !single_step_i) begin // only if the first instruction didn't throw an exception and this instruction won't throw an exception // and the functional unit is of type ALU, LOAD, CTRL_FLOW, MULT, FPU or FPU_VEC - if (!exception_o.valid && !commit_instr_i[1].ex.valid - && (commit_instr_i[1].fu inside {ALU, LOAD, CTRL_FLOW, MULT, FPU, FPU_VEC})) begin + if (!commit_instr_i[1].ex.valid && (commit_instr_i[1].fu inside {ALU, LOAD, CTRL_FLOW, MULT, FPU, FPU_VEC})) begin - if (commit_instr_i[1].is_macro_instr && commit_instr_i[1].is_last_macro_instr) + if (CVA6Cfg.RVZCMP && commit_instr_i[1].is_macro_instr && commit_instr_i[1].is_last_macro_instr) commit_macro_ack[1] = 1'b1; else commit_macro_ack[1] = 1'b0; @@ -334,15 +335,16 @@ module commit_stage // additionally check if we are retiring an FPU instruction 
because we need to make sure that we write all // exception flags - if (CVA6Cfg.FpPresent && commit_instr_i[1].fu inside {FPU, FPU_VEC}) begin - if (csr_write_fflags_o) - csr_wdata_o = { - {CVA6Cfg.XLEN - 5{1'b0}}, - (commit_instr_i[0].ex.cause[4:0] | commit_instr_i[1].ex.cause[4:0]) - }; - else csr_wdata_o = {{CVA6Cfg.XLEN - 5{1'b0}}, commit_instr_i[1].ex.cause[4:0]}; - - csr_write_fflags_o = 1'b1; + if (CVA6Cfg.FpPresent) begin + if (commit_instr_i[1].fu inside {FPU, FPU_VEC}) begin + if (csr_write_fflags_o) + csr_wdata_o = { + {CVA6Cfg.XLEN - 5{1'b0}}, + (commit_instr_i[0].ex.cause[4:0] | commit_instr_i[1].ex.cause[4:0]) + }; + else csr_wdata_o = {{CVA6Cfg.XLEN - 5{1'b0}}, commit_instr_i[1].ex.cause[4:0]}; + csr_write_fflags_o = 1'b1; + end end end end diff --git a/core/compressed_decoder.sv b/core/compressed_decoder.sv index 2c60df65f1..1f09fd1f29 100644 --- a/core/compressed_decoder.sv +++ b/core/compressed_decoder.sv @@ -39,7 +39,6 @@ module compressed_decoder #( // ------------------- always_comb begin illegal_instr_o = 1'b0; - instr_o = '0; is_compressed_o = 1'b1; instr_o = instr_i; is_macro_instr_o = 0; diff --git a/core/controller.sv b/core/controller.sv index a5edcf131a..ebd0e0c75d 100644 --- a/core/controller.sv +++ b/core/controller.sv @@ -45,7 +45,9 @@ module controller input logic flush_dcache_ack_i, // Flush TLBs - EX_STAGE output logic flush_tlb_o, + // TO_BE_COMPLETED - TO_BE_COMPLETED output logic flush_tlb_vvma_o, + // TO_BE_COMPLETED - TO_BE_COMPLETED output logic flush_tlb_gvma_o, // Halt request from CSR (WFI instruction) - CSR_REGFILE input logic halt_csr_i, @@ -69,7 +71,9 @@ module controller input logic fence_i, // We got an instruction to flush the TLBs and pipeline - COMMIT_STAGE input logic sfence_vma_i, + // TO_BE_COMPLETED - TO_BE_COMPLETED input logic hfence_vvma_i, + // TO_BE_COMPLETED - TO_BE_COMPLETED input logic hfence_gvma_i, // Flush request from commit stage - COMMIT_STAGE input logic flush_commit_i, diff --git 
a/core/csr_regfile.sv b/core/csr_regfile.sv index 8dbfe038f3..296a8d7916 100644 --- a/core/csr_regfile.sv +++ b/core/csr_regfile.sv @@ -42,7 +42,6 @@ module csr_regfile input logic [CVA6Cfg.VLEN-1:0] boot_addr_i, // Hart id in a multicore environment (reflected in a CSR) - SUBSYSTEM input logic [CVA6Cfg.XLEN-1:0] hart_id_i, - // we are taking an exception // We've got an exception from the commit stage, take it - COMMIT_STAGE input exception_t ex_i, // Operation to perform on the CSR file - COMMIT_STAGE @@ -153,7 +152,6 @@ module csr_regfile output logic dcache_en_o, // Accelerator memory consistent mode - ACC_DISPATCHER output logic acc_cons_en_o, - // Performance Counter // read/write address to performance counter module - PERF_COUNTERS output logic [11:0] perf_addr_o, // write data to performance counter module - PERF_COUNTERS @@ -163,9 +161,9 @@ module csr_regfile // TO_BE_COMPLETED - PERF_COUNTERS output logic perf_we_o, // PMP configuration containing pmpcfg for max 64 PMPs - ACC_DISPATCHER - output riscv::pmpcfg_t [CVA6Cfg.NrPMPEntries:0] pmpcfg_o, + output riscv::pmpcfg_t [(CVA6Cfg.NrPMPEntries > 0 ? CVA6Cfg.NrPMPEntries-1 : 0):0] pmpcfg_o, // PMP addresses - ACC_DISPATCHER - output logic [CVA6Cfg.NrPMPEntries:0][CVA6Cfg.PLEN-3:0] pmpaddr_o, + output logic [(CVA6Cfg.NrPMPEntries > 0 ? CVA6Cfg.NrPMPEntries-1 : 0):0][CVA6Cfg.PLEN-3:0] pmpaddr_o, // TO_BE_COMPLETED - PERF_COUNTERS output logic [31:0] mcountinhibit_o, // RVFI @@ -293,8 +291,8 @@ module csr_regfile | (CVA6Cfg.XLEN'(CVA6Cfg.NSX) << 23) // X - Non-standard extensions present | ((CVA6Cfg.XLEN == 64 ? 
2 : 1) << CVA6Cfg.XLEN - 2); // MXL - assign pmpcfg_o = pmpcfg_q[CVA6Cfg.NrPMPEntries:0]; - assign pmpaddr_o = pmpaddr_q[CVA6Cfg.NrPMPEntries:0]; + assign pmpcfg_o = pmpcfg_q[(CVA6Cfg.NrPMPEntries>0?CVA6Cfg.NrPMPEntries-1 : 0):0]; + assign pmpaddr_o = pmpaddr_q[(CVA6Cfg.NrPMPEntries>0?CVA6Cfg.NrPMPEntries-1 : 0):0]; riscv::fcsr_t fcsr_q, fcsr_d; // ---------------- @@ -553,9 +551,7 @@ module csr_regfile riscv::CSR_MHARTID: csr_rdata = hart_id_i; riscv::CSR_MCONFIGPTR: csr_rdata = '0; // not implemented riscv::CSR_MCOUNTINHIBIT: - if (CVA6Cfg.PerfCounterEn) - csr_rdata = {{(CVA6Cfg.XLEN - (MHPMCounterNum + 3)) {1'b0}}, mcountinhibit_q}; - else read_access_exception = 1'b1; + csr_rdata = {{(CVA6Cfg.XLEN - (MHPMCounterNum + 3)) {1'b0}}, mcountinhibit_q}; // Counters and Timers riscv::CSR_MCYCLE: csr_rdata = cycle_q[CVA6Cfg.XLEN-1:0]; riscv::CSR_MCYCLEH: @@ -856,8 +852,8 @@ module csr_regfile // -> last bit of pmpaddr must be set 0/1 based on the mode: // NA4, NAPOT: 1 // TOR, OFF: 0 - if (pmpcfg_q[index].addr_mode[1] == 1'b1 || pmpcfg_q[index].addr_mode == 'h0) - csr_rdata = pmpaddr_q[index][CVA6Cfg.PLEN-3:0]; + if (pmpcfg_q[index].addr_mode[1] == 1'b1) + csr_rdata = {pmpaddr_q[index][CVA6Cfg.PLEN-3:1], 1'b1}; else csr_rdata = {pmpaddr_q[index][CVA6Cfg.PLEN-3:1], 1'b0}; end default: read_access_exception = 1'b1; @@ -1494,7 +1490,7 @@ module csr_regfile riscv::CSR_MCOUNTINHIBIT: if (CVA6Cfg.PerfCounterEn) mcountinhibit_d = {csr_wdata[MHPMCounterNum+2:2], 1'b0, csr_wdata[0]}; - else update_access_exception = 1'b1; + else mcountinhibit_d = '0; // performance counters riscv::CSR_MCYCLE: cycle_d[CVA6Cfg.XLEN-1:0] = csr_wdata; riscv::CSR_MCYCLEH: @@ -1833,25 +1829,23 @@ module csr_regfile // trap to supervisor mode if (CVA6Cfg.RVS && trap_to_priv_lvl == riscv::PRIV_LVL_S) begin - if (CVA6Cfg.RVH) begin - if (trap_to_v) begin - // update sstatus - vsstatus_d.sie = 1'b0; - vsstatus_d.spie = vsstatus_q.sie; - // this can either be user or supervisor mode - vsstatus_d.spp = 
priv_lvl_q[0]; - // set cause - vscause_d = ex_i.cause[CVA6Cfg.XLEN-1] ? {ex_i.cause[CVA6Cfg.XLEN-1:2], 2'b01} : ex_i.cause; - // set epc - vsepc_d = {{CVA6Cfg.XLEN - CVA6Cfg.VLEN{pc_i[CVA6Cfg.VLEN-1]}}, pc_i}; - // set vstval - vstval_d = (ariane_pkg::ZERO_TVAL - && (ex_i.cause inside { - riscv::ILLEGAL_INSTR, - riscv::BREAKPOINT, - riscv::ENV_CALL_UMODE - } || ex_i.cause[CVA6Cfg.XLEN-1])) ? '0 : ex_i.tval; - end + if (CVA6Cfg.RVH && trap_to_v) begin + // update sstatus + vsstatus_d.sie = 1'b0; + vsstatus_d.spie = (CVA6Cfg.RVH) ? vsstatus_q.sie : '0; + // this can either be user or supervisor mode + vsstatus_d.spp = priv_lvl_q[0]; + // set cause + vscause_d = ex_i.cause[CVA6Cfg.XLEN-1] ? {ex_i.cause[CVA6Cfg.XLEN-1:2], 2'b01} : ex_i.cause; + // set epc + vsepc_d = {{CVA6Cfg.XLEN - CVA6Cfg.VLEN{pc_i[CVA6Cfg.VLEN-1]}}, pc_i}; + // set vstval + vstval_d = (ariane_pkg::ZERO_TVAL + && (ex_i.cause inside { + riscv::ILLEGAL_INSTR, + riscv::BREAKPOINT, + riscv::ENV_CALL_UMODE + } || ex_i.cause[CVA6Cfg.XLEN-1])) ? 
'0 : ex_i.tval; end else begin // update sstatus mstatus_d.sie = 1'b0; @@ -2247,7 +2241,7 @@ module csr_regfile // precedence over interrupts if (csr_op_i inside {CSR_WRITE, CSR_SET, CSR_CLEAR, CSR_READ}) begin if (access_priv < csr_addr.csr_decode.priv_lvl) begin - if (v_q && csr_addr.csr_decode.priv_lvl == riscv::PRIV_LVL_HS) + if (v_q && csr_addr.csr_decode.priv_lvl <= riscv::PRIV_LVL_HS) virtual_privilege_violation = 1'b1; else privilege_violation = 1'b1; end @@ -2302,7 +2296,7 @@ module csr_regfile // if we are reading or writing, check for the correct privilege level this has // precedence over interrupts if (csr_op_i inside {CSR_WRITE, CSR_SET, CSR_CLEAR, CSR_READ}) begin - if ((riscv::priv_lvl_t'(priv_lvl_o & csr_addr.csr_decode.priv_lvl) != csr_addr.csr_decode.priv_lvl)) begin + if (CVA6Cfg.RVU && (riscv::priv_lvl_t'(priv_lvl_o & csr_addr.csr_decode.priv_lvl) != csr_addr.csr_decode.priv_lvl)) begin privilege_violation = 1'b1; end // check access to debug mode only CSRs @@ -2415,8 +2409,10 @@ module csr_regfile epc_o = mepc_q[CVA6Cfg.VLEN-1:0]; // we are returning from supervisor or virtual supervisor mode, so take the sepc register - if (CVA6Cfg.RVS && sret) begin - epc_o = (CVA6Cfg.RVH && v_q) ? vsepc_q[CVA6Cfg.VLEN-1:0] : sepc_q[CVA6Cfg.VLEN-1:0]; + if (CVA6Cfg.RVS) begin + if (sret) begin + epc_o = (CVA6Cfg.RVH && v_q) ? vsepc_q[CVA6Cfg.VLEN-1:0] : sepc_q[CVA6Cfg.VLEN-1:0]; + end end // we are returning from debug mode, to take the dpc register if (CVA6Cfg.DebugEn) begin @@ -2457,10 +2453,14 @@ module csr_regfile assign frm_o = fcsr_q.frm; assign fprec_o = fcsr_q.fprec; // MMU outputs - assign satp_ppn_o = satp_q.ppn; + assign satp_ppn_o = CVA6Cfg.RVS ? satp_q.ppn : '0; assign vsatp_ppn_o = CVA6Cfg.RVH ? vsatp_q.ppn : '0; assign hgatp_ppn_o = CVA6Cfg.RVH ? 
hgatp_q.ppn : '0; - assign asid_o = satp_q.asid[CVA6Cfg.ASID_WIDTH-1:0]; + if (CVA6Cfg.RVS) begin + assign asid_o = satp_q.asid[CVA6Cfg.ASID_WIDTH-1:0]; + end else begin + assign asid_o = '0; + end assign vs_asid_o = CVA6Cfg.RVH ? vsatp_q.asid[CVA6Cfg.ASID_WIDTH-1:0] : '0; assign vmid_o = CVA6Cfg.RVH ? hgatp_q.vmid[CVA6Cfg.VMID_WIDTH-1:0] : '0; assign sum_o = mstatus_q.sum; @@ -2483,12 +2483,20 @@ module csr_regfile : 1'b0; assign en_g_translation_o = 1'b0; end - assign mxr_o = mstatus_q.mxr; + assign mxr_o = mstatus_q.mxr; assign vmxr_o = CVA6Cfg.RVH ? vsstatus_q.mxr : '0; - assign tvm_o = (CVA6Cfg.RVH && v_q) ? hstatus_q.vtvm : mstatus_q.tvm; - assign tw_o = mstatus_q.tw; + if (CVA6Cfg.RVH) begin + assign tvm_o = (v_q) ? hstatus_q.vtvm : mstatus_q.tvm; + end else begin + assign tvm_o = mstatus_q.tvm; + end + assign tw_o = mstatus_q.tw; assign vtw_o = CVA6Cfg.RVH ? hstatus_q.vtw : '0; - assign tsr_o = (CVA6Cfg.RVH && v_q) ? hstatus_q.vtsr : mstatus_q.tsr; + if (CVA6Cfg.RVH) begin + assign tsr_o = (v_q) ? hstatus_q.vtsr : mstatus_q.tsr; + end else begin + assign tsr_o = mstatus_q.tsr; + end assign halt_csr_o = wfi_q; `ifdef PITON_ARIANE assign icache_en_o = icache_q[0]; @@ -2501,7 +2509,7 @@ module csr_regfile // determine if mprv needs to be considered if in debug mode assign mprv = (CVA6Cfg.DebugEn && debug_mode_q && !dcsr_q.mprven) ? 1'b0 : mstatus_q.mprv; assign debug_mode_o = debug_mode_q; - assign single_step_o = dcsr_q.step; + assign single_step_o = CVA6Cfg.DebugEn ? 
dcsr_q.step : 1'b0; assign mcountinhibit_o = {{29 - MHPMCounterNum{1'b0}}, mcountinhibit_q}; // sequential process diff --git a/core/cva6.sv b/core/cva6.sv index a0c6b2e845..4c6b3bf7d6 100644 --- a/core/cva6.sv +++ b/core/cva6.sv @@ -13,6 +13,7 @@ // Description: CVA6 Top-level module `include "rvfi_types.svh" +`include "cvxif_types.svh" module cva6 import ariane_pkg::*; @@ -26,7 +27,7 @@ module cva6 parameter type rvfi_probes_instr_t = `RVFI_PROBES_INSTR_T(CVA6Cfg), parameter type rvfi_probes_csr_t = `RVFI_PROBES_CSR_T(CVA6Cfg), parameter type rvfi_probes_t = struct packed { - logic csr; + rvfi_probes_csr_t csr; rvfi_probes_instr_t instr; }, @@ -113,6 +114,12 @@ module cva6 logic is_double_rd_macro_instr; // is double move decoded 32bit instruction of macro definition logic vfp; // is this a vector floating-point instruction? }, + localparam type writeback_t = struct packed { + logic valid; // wb data is valid + logic [CVA6Cfg.XLEN-1:0] data; //wb data + logic ex_valid; // exception from WB + logic [CVA6Cfg.TRANS_ID_BITS-1:0] trans_id; //transaction ID + }, // branch-predict // this is the struct we get back from ex stage and we will use it to update @@ -272,8 +279,22 @@ module cva6 // parameter type acc_cfg_t = logic, parameter acc_cfg_t AccCfg = '0, - parameter type cvxif_req_t = cvxif_pkg::cvxif_req_t, - parameter type cvxif_resp_t = cvxif_pkg::cvxif_resp_t + // CVXIF Types + parameter type readregflags_t = `READREGFLAGS_T(CVA6Cfg), + parameter type writeregflags_t = `WRITEREGFLAGS_T(CVA6Cfg), + parameter type id_t = `ID_T(CVA6Cfg), + parameter type hartid_t = `HARTID_T(CVA6Cfg), + parameter type x_compressed_req_t = `X_COMPRESSED_REQ_T(CVA6Cfg, hartid_t), + parameter type x_compressed_resp_t = `X_COMPRESSED_RESP_T(CVA6Cfg), + parameter type x_issue_req_t = `X_ISSUE_REQ_T(CVA6Cfg, hartid_t, id_t), + parameter type x_issue_resp_t = `X_ISSUE_RESP_T(CVA6Cfg, writeregflags_t, readregflags_t), + parameter type x_register_t = `X_REGISTER_T(CVA6Cfg, hartid_t, id_t, 
readregflags_t), + parameter type x_commit_t = `X_COMMIT_T(CVA6Cfg, hartid_t, id_t), + parameter type x_result_t = `X_RESULT_T(CVA6Cfg, hartid_t, id_t, writeregflags_t), + parameter type cvxif_req_t = + `CVXIF_REQ_T(CVA6Cfg, x_compressed_req_t, x_issue_req_t, x_register_t, x_commit_t), + parameter type cvxif_resp_t = + `CVXIF_RESP_T(CVA6Cfg, x_compressed_resp_t, x_issue_resp_t, x_result_t) ) ( // Subsystem Clock - SUBSYSTEM input logic clk_i, @@ -343,8 +364,27 @@ module cva6 logic [CVA6Cfg.NrCommitPorts-1:0] commit_macro_ack; localparam NumPorts = 4; - cvxif_pkg::cvxif_req_t cvxif_req; - cvxif_pkg::cvxif_resp_t cvxif_resp; + + // CVXIF + cvxif_req_t cvxif_req; + // CVXIF OUTPUTS + logic x_compressed_valid; + x_compressed_req_t x_compressed_req; + logic x_issue_valid; + x_issue_req_t x_issue_req; + logic x_register_valid; + x_register_t x_register; + logic x_commit_valid; + x_commit_t x_commit; + logic x_result_ready; + // CVXIF INPUTS + logic x_compressed_ready; + x_compressed_resp_t x_compressed_resp; + logic x_issue_ready; + x_issue_resp_t x_issue_resp; + logic x_register_ready; + logic x_result_valid; + x_result_t x_result; // -------------- // PCGEN <-> CSR @@ -437,9 +477,11 @@ module cva6 logic x_valid_ex_id; exception_t x_exception_ex_id; logic x_we_ex_id; + logic [4:0] x_rd_ex_id; logic [CVA6Cfg.NrIssuePorts-1:0] x_issue_valid_id_ex; logic x_issue_ready_ex_id; logic [31:0] x_off_instr_id_ex; + logic x_transaction_rejected; // -------------- // EX <-> COMMIT // -------------- @@ -518,8 +560,8 @@ module cva6 logic acc_cons_en_csr; logic debug_mode; logic single_step_csr_commit; - riscv::pmpcfg_t [CVA6Cfg.NrPMPEntries:0] pmpcfg; - logic [CVA6Cfg.NrPMPEntries:0][CVA6Cfg.PLEN-3:0] pmpaddr; + riscv::pmpcfg_t [CVA6Cfg.NrPMPEntries-1:0] pmpcfg; + logic [CVA6Cfg.NrPMPEntries-1:0][CVA6Cfg.PLEN-3:0] pmpaddr; logic [31:0] mcountinhibit_csr_perf; // ---------------------------- // Performance Counters <-> * @@ -601,25 +643,26 @@ module cva6 
.icache_dreq_t(icache_dreq_t), .icache_drsp_t(icache_drsp_t) ) i_frontend ( - .flush_i (flush_ctrl_if), // not entirely correct + .clk_i, + .rst_ni, + .boot_addr_i (boot_addr_i[CVA6Cfg.VLEN-1:0]), .flush_bp_i (1'b0), + .flush_i (flush_ctrl_if), // not entirely correct .halt_i (halt_ctrl), - .debug_mode_i (debug_mode), - .boot_addr_i (boot_addr_i[CVA6Cfg.VLEN-1:0]), - .icache_dreq_i (icache_dreq_cache_if), - .icache_dreq_o (icache_dreq_if_cache), - .resolved_branch_i (resolved_branch), - .pc_commit_i (pc_commit), .set_pc_commit_i (set_pc_ctrl_pcgen), - .set_debug_pc_i (set_debug_pc), - .epc_i (epc_commit_pcgen), + .pc_commit_i (pc_commit), + .ex_valid_i (ex_commit.valid), + .resolved_branch_i (resolved_branch), .eret_i (eret), + .epc_i (epc_commit_pcgen), .trap_vector_base_i (trap_vector_base_commit_pcgen), - .ex_valid_i (ex_commit.valid), + .set_debug_pc_i (set_debug_pc), + .debug_mode_i (debug_mode), + .icache_dreq_o (icache_dreq_if_cache), + .icache_dreq_i (icache_dreq_cache_if), .fetch_entry_o (fetch_entry_if_id), .fetch_entry_valid_o(fetch_valid_if_id), - .fetch_entry_ready_i(fetch_ready_id_if), - .* + .fetch_entry_ready_i(fetch_ready_id_if) ); // --------- @@ -633,7 +676,9 @@ module cva6 .irq_ctrl_t(irq_ctrl_t), .scoreboard_entry_t(scoreboard_entry_t), .interrupts_t(interrupts_t), - .INTERRUPTS(INTERRUPTS) + .INTERRUPTS(INTERRUPTS), + .x_compressed_req_t(x_compressed_req_t), + .x_compressed_resp_t(x_compressed_resp_t) ) id_stage_i ( .clk_i, .rst_ni, @@ -652,20 +697,25 @@ module cva6 .rvfi_is_compressed_o(rvfi_is_compressed), - .priv_lvl_i (priv_lvl), - .v_i (v), - .fs_i (fs), - .vfs_i (vfs), - .frm_i (frm_csr_id_issue_ex), - .vs_i (vs), - .irq_i (irq_i), - .irq_ctrl_i (irq_ctrl_csr_id), - .debug_mode_i(debug_mode), - .tvm_i (tvm_csr_id), - .tw_i (tw_csr_id), - .vtw_i (vtw_csr_id), - .tsr_i (tsr_csr_id), - .hu_i (hu) + .priv_lvl_i (priv_lvl), + .v_i (v), + .fs_i (fs), + .vfs_i (vfs), + .frm_i (frm_csr_id_issue_ex), + .vs_i (vs), + .irq_i (irq_i), + .irq_ctrl_i 
(irq_ctrl_csr_id), + .debug_mode_i (debug_mode), + .tvm_i (tvm_csr_id), + .tw_i (tw_csr_id), + .vtw_i (vtw_csr_id), + .tsr_i (tsr_csr_id), + .hu_i (hu), + .hart_id_i (hart_id_i), + .compressed_ready_i(x_compressed_ready), + .compressed_resp_i (x_compressed_resp), + .compressed_valid_o(x_compressed_valid), + .compressed_req_o (x_compressed_req) ); logic [CVA6Cfg.NrWbPorts-1:0][CVA6Cfg.TRANS_ID_BITS-1:0] trans_id_ex_id; @@ -693,16 +743,39 @@ module cva6 assign ex_ex_ex_id[FPU_WB] = fpu_exception_ex_id; assign wt_valid_ex_id[FPU_WB] = fpu_valid_ex_id; + always_comb begin : gen_cvxif_input_assignement + x_compressed_ready = cvxif_resp_i.compressed_ready; + x_compressed_resp = cvxif_resp_i.compressed_resp; + x_issue_ready = cvxif_resp_i.issue_ready; + x_issue_resp = cvxif_resp_i.issue_resp; + x_register_ready = cvxif_resp_i.register_ready; + x_result_valid = cvxif_resp_i.result_valid; + x_result = cvxif_resp_i.result; + end if (CVA6Cfg.CvxifEn) begin + always_comb begin : gen_cvxif_output_assignement + cvxif_req.compressed_valid = x_compressed_valid; + cvxif_req.compressed_req = x_compressed_req; + cvxif_req.issue_valid = x_issue_valid; + cvxif_req.issue_req = x_issue_req; + cvxif_req.register_valid = x_register_valid; + cvxif_req.register = x_register; + cvxif_req.commit_valid = x_commit_valid; + cvxif_req.commit = x_commit; + cvxif_req.result_ready = x_result_ready; + end assign trans_id_ex_id[X_WB] = x_trans_id_ex_id; assign wbdata_ex_id[X_WB] = x_result_ex_id; assign ex_ex_ex_id[X_WB] = x_exception_ex_id; assign wt_valid_ex_id[X_WB] = x_valid_ex_id; end else if (CVA6Cfg.EnableAccelerator) begin + assign cvxif_req = '0; assign trans_id_ex_id[ACC_WB] = acc_trans_id_ex_id; assign wbdata_ex_id[ACC_WB] = acc_result_ex_id; assign ex_ex_ex_id[ACC_WB] = acc_exception_ex_id; assign wt_valid_ex_id[ACC_WB] = acc_valid_ex_id; + end else begin + assign cvxif_req = '0; end if (CVA6Cfg.CvxifEn && CVA6Cfg.EnableAccelerator) begin : gen_err_xif_and_acc @@ -718,63 +791,80 @@ module 
cva6 .branchpredict_sbe_t(branchpredict_sbe_t), .exception_t(exception_t), .fu_data_t(fu_data_t), - .scoreboard_entry_t(scoreboard_entry_t) + .scoreboard_entry_t(scoreboard_entry_t), + .writeback_t(writeback_t), + .x_issue_req_t(x_issue_req_t), + .x_issue_resp_t(x_issue_resp_t), + .x_register_t(x_register_t), + .x_commit_t(x_commit_t) ) issue_stage_i ( .clk_i, .rst_ni, - .sb_full_o (sb_full), - .flush_unissued_instr_i(flush_unissued_instr_ctrl_id), - .flush_i (flush_ctrl_id), - .stall_i (stall_acc_id), + .sb_full_o (sb_full), + .flush_unissued_instr_i (flush_unissued_instr_ctrl_id), + .flush_i (flush_ctrl_id), + .stall_i (stall_acc_id), // ID Stage - .decoded_instr_i (issue_entry_id_issue), - .orig_instr_i (orig_instr_id_issue), - .decoded_instr_valid_i (issue_entry_valid_id_issue), - .is_ctrl_flow_i (is_ctrl_fow_id_issue), - .decoded_instr_ack_o (issue_instr_issue_id), + .decoded_instr_i (issue_entry_id_issue), + .orig_instr_i (orig_instr_id_issue), + .decoded_instr_valid_i (issue_entry_valid_id_issue), + .is_ctrl_flow_i (is_ctrl_fow_id_issue), + .decoded_instr_ack_o (issue_instr_issue_id), // Functional Units - .rs1_forwarding_o (rs1_forwarding_id_ex), - .rs2_forwarding_o (rs2_forwarding_id_ex), - .fu_data_o (fu_data_id_ex), - .pc_o (pc_id_ex), - .is_compressed_instr_o (is_compressed_instr_id_ex), - .tinst_o (tinst_ex), + .rs1_forwarding_o (rs1_forwarding_id_ex), + .rs2_forwarding_o (rs2_forwarding_id_ex), + .fu_data_o (fu_data_id_ex), + .pc_o (pc_id_ex), + .is_compressed_instr_o (is_compressed_instr_id_ex), + .tinst_o (tinst_ex), // fixed latency unit ready - .flu_ready_i (flu_ready_ex_id), + .flu_ready_i (flu_ready_ex_id), // ALU - .alu_valid_o (alu_valid_id_ex), + .alu_valid_o (alu_valid_id_ex), // Branches and Jumps - .branch_valid_o (branch_valid_id_ex), // branch is valid - .branch_predict_o (branch_predict_id_ex), // branch predict to ex - .resolve_branch_i (resolve_branch_ex_id), // in order to resolve the branch + .branch_valid_o (branch_valid_id_ex), // 
branch is valid + .branch_predict_o (branch_predict_id_ex), // branch predict to ex + .resolve_branch_i (resolve_branch_ex_id), // in order to resolve the branch // LSU - .lsu_ready_i (lsu_ready_ex_id), - .lsu_valid_o (lsu_valid_id_ex), + .lsu_ready_i (lsu_ready_ex_id), + .lsu_valid_o (lsu_valid_id_ex), // Multiplier - .mult_valid_o (mult_valid_id_ex), + .mult_valid_o (mult_valid_id_ex), // FPU - .fpu_ready_i (fpu_ready_ex_id), - .fpu_valid_o (fpu_valid_id_ex), - .fpu_fmt_o (fpu_fmt_id_ex), - .fpu_rm_o (fpu_rm_id_ex), + .fpu_ready_i (fpu_ready_ex_id), + .fpu_valid_o (fpu_valid_id_ex), + .fpu_fmt_o (fpu_fmt_id_ex), + .fpu_rm_o (fpu_rm_id_ex), // ALU2 - .alu2_valid_o (alu2_valid_id_ex), + .alu2_valid_o (alu2_valid_id_ex), // CSR - .csr_valid_o (csr_valid_id_ex), + .csr_valid_o (csr_valid_id_ex), // CVXIF - .x_issue_valid_o (x_issue_valid_id_ex), - .x_issue_ready_i (x_issue_ready_ex_id), - .x_off_instr_o (x_off_instr_id_ex), + .xfu_valid_o (x_issue_valid_id_ex), + .xfu_ready_i (x_issue_ready_ex_id), + .x_off_instr_o (x_off_instr_id_ex), + .hart_id_i (hart_id_i), + .x_issue_ready_i (x_issue_ready), + .x_issue_resp_i (x_issue_resp), + .x_issue_valid_o (x_issue_valid), + .x_issue_req_o (x_issue_req), + .x_register_ready_i (x_register_ready), + .x_register_valid_o (x_register_valid), + .x_register_o (x_register), + .x_commit_valid_o (x_commit_valid), + .x_commit_o (x_commit), + .x_transaction_rejected_o(x_transaction_rejected), // Accelerator - .issue_instr_o (issue_instr_id_acc), - .issue_instr_hs_o (issue_instr_hs_id_acc), + .issue_instr_o (issue_instr_id_acc), + .issue_instr_hs_o (issue_instr_hs_id_acc), // Commit - .resolved_branch_i (resolved_branch), - .trans_id_i (trans_id_ex_id), - .wbdata_i (wbdata_ex_id), - .ex_ex_i (ex_ex_ex_id), - .wt_valid_i (wt_valid_ex_id), - .x_we_i (x_we_ex_id), + .trans_id_i (trans_id_ex_id), + .resolved_branch_i (resolved_branch), + .wbdata_i (wbdata_ex_id), + .ex_ex_i (ex_ex_ex_id), + .wt_valid_i (wt_valid_ex_id), + .x_we_i 
(x_we_ex_id), + .x_rd_i (x_rd_ex_id), .waddr_i (waddr_commit_id), .wdata_i (wdata_commit_id), @@ -787,8 +877,7 @@ module cva6 .stall_issue_o (stall_issue), //RVFI .rvfi_issue_pointer_o (rvfi_issue_pointer), - .rvfi_commit_pointer_o(rvfi_commit_pointer), - .* + .rvfi_commit_pointer_o(rvfi_commit_pointer) ); // --------- @@ -806,7 +895,8 @@ module cva6 .icache_arsp_t(icache_arsp_t), .icache_dreq_t(icache_dreq_t), .icache_drsp_t(icache_drsp_t), - .lsu_ctrl_t(lsu_ctrl_t) + .lsu_ctrl_t(lsu_ctrl_t), + .x_result_t(x_result_t) ) ex_stage_i ( .clk_i(clk_i), .rst_ni(rst_ni), @@ -877,13 +967,16 @@ module cva6 .x_valid_i (x_issue_valid_id_ex), .x_ready_o (x_issue_ready_ex_id), .x_off_instr_i (x_off_instr_id_ex), + .x_transaction_rejected_i(x_transaction_rejected), .x_trans_id_o (x_trans_id_ex_id), .x_exception_o (x_exception_ex_id), .x_result_o (x_result_ex_id), .x_valid_o (x_valid_ex_id), .x_we_o (x_we_ex_id), - .cvxif_req_o (cvxif_req), - .cvxif_resp_i (cvxif_resp), + .x_rd_o (x_rd_ex_id), + .x_result_valid_i (x_result_valid), + .x_result_i (x_result), + .x_result_ready_o (x_result_ready), // Accelerator .acc_valid_i (acc_valid_acc_ex), // Performance counters @@ -950,33 +1043,32 @@ module cva6 .commit_drop_i (commit_drop_id_commit), .commit_ack_o (commit_ack_commit_id), .commit_macro_ack_o(commit_macro_ack), - .no_st_pending_i (no_st_pending_commit), .waddr_o (waddr_commit_id), .wdata_o (wdata_commit_id), .we_gpr_o (we_gpr_commit_id), .we_fpr_o (we_fpr_commit_id), - .commit_lsu_o (lsu_commit_commit_ex), - .commit_lsu_ready_i(lsu_commit_ready_ex_commit), - .commit_tran_id_o (lsu_commit_trans_id), - .amo_valid_commit_o(amo_valid_commit), .amo_resp_i (amo_resp), - .commit_csr_o (csr_commit_commit_ex), .pc_o (pc_commit), .csr_op_o (csr_op_commit_csr), .csr_wdata_o (csr_wdata_commit_csr), .csr_rdata_i (csr_rdata_csr_commit), .csr_write_fflags_o(csr_write_fflags_commit_cs), .csr_exception_i (csr_exception_csr_commit), + .commit_lsu_o (lsu_commit_commit_ex), + 
.commit_lsu_ready_i(lsu_commit_ready_ex_commit), + .commit_tran_id_o (lsu_commit_trans_id), + .amo_valid_commit_o(amo_valid_commit), + .no_st_pending_i (no_st_pending_commit), + .commit_csr_o (csr_commit_commit_ex), .fence_i_o (fence_i_commit_controller), .fence_o (fence_commit_controller), + .flush_commit_o (flush_commit), .sfence_vma_o (sfence_vma_commit_controller), .hfence_vvma_o (hfence_vvma_commit_controller), - .hfence_gvma_o (hfence_gvma_commit_controller), - .flush_commit_o (flush_commit), - .* + .hfence_gvma_o (hfence_gvma_commit_controller) ); - assign commit_ack = commit_ack_commit_id & ~commit_drop_id_commit; + assign commit_ack = commit_macro_ack & ~commit_drop_id_commit; // --------- // CSR @@ -989,25 +1081,27 @@ module cva6 .rvfi_probes_csr_t (rvfi_probes_csr_t), .MHPMCounterNum (MHPMCounterNum) ) csr_regfile_i ( + .clk_i, + .rst_ni, + .time_irq_i, .flush_o (flush_csr_ctrl), .halt_csr_o (halt_csr_ctrl), .commit_instr_i (commit_instr_id_commit), - .commit_ack_i (commit_macro_ack), + .commit_ack_i (commit_ack), .boot_addr_i (boot_addr_i[CVA6Cfg.VLEN-1:0]), .hart_id_i (hart_id_i[CVA6Cfg.XLEN-1:0]), .ex_i (ex_commit), .csr_op_i (csr_op_commit_csr), - .csr_write_fflags_i (csr_write_fflags_commit_cs), - .dirty_fp_state_i (dirty_fp_state), - .dirty_v_state_i (dirty_v_state), .csr_addr_i (csr_addr_ex_csr), .csr_wdata_i (csr_wdata_commit_csr), .csr_rdata_o (csr_rdata_csr_commit), + .dirty_fp_state_i (dirty_fp_state), + .csr_write_fflags_i (csr_write_fflags_commit_cs), + .dirty_v_state_i (dirty_v_state), .pc_i (pc_commit), .csr_exception_o (csr_exception_csr_commit), .epc_o (epc_commit_pcgen), .eret_o (eret), - .set_debug_pc_o (set_debug_pc), .trap_vector_base_o (trap_vector_base_commit_pcgen), .priv_lvl_o (priv_lvl), .v_o (v), @@ -1020,13 +1114,13 @@ module cva6 .fprec_o (fprec_csr_ex), .vs_o (vs), .irq_ctrl_o (irq_ctrl_csr_id), - .ld_st_priv_lvl_o (ld_st_priv_lvl_csr_ex), - .ld_st_v_o (ld_st_v_csr_ex), - .csr_hs_ld_st_inst_i (csr_hs_ld_st_inst_ex), 
.en_translation_o (enable_translation_csr_ex), .en_g_translation_o (enable_g_translation_csr_ex), .en_ld_st_translation_o (en_ld_st_translation_csr_ex), .en_ld_st_g_translation_o(en_ld_st_g_translation_csr_ex), + .ld_st_priv_lvl_o (ld_st_priv_lvl_csr_ex), + .ld_st_v_o (ld_st_v_csr_ex), + .csr_hs_ld_st_inst_i (csr_hs_ld_st_inst_ex), .sum_o (sum_csr_ex), .vs_sum_o (vs_sum_csr_ex), .mxr_o (mxr_csr_ex), @@ -1037,6 +1131,10 @@ module cva6 .vs_asid_o (vs_asid_csr_ex), .hgatp_ppn_o (hgatp_ppn_csr_ex), .vmid_o (vmid_csr_ex), + .irq_i, + .ipi_i, + .debug_req_i, + .set_debug_pc_o (set_debug_pc), .tvm_o (tvm_csr_id), .tw_o (tw_csr_id), .vtw_o (vtw_csr_id), @@ -1044,8 +1142,8 @@ module cva6 .hu_o (hu), .debug_mode_o (debug_mode), .single_step_o (single_step_csr_commit), - .dcache_en_o (dcache_en_csr_nbdcache), .icache_en_o (icache_en_csr), + .dcache_en_o (dcache_en_csr_nbdcache), .acc_cons_en_o (acc_cons_en_csr), .perf_addr_o (addr_csr_perf), .perf_data_o (data_csr_perf), @@ -1055,12 +1153,7 @@ module cva6 .pmpaddr_o (pmpaddr), .mcountinhibit_o (mcountinhibit_csr_perf), //RVFI - .rvfi_csr_o (rvfi_csr), - .debug_req_i, - .ipi_i, - .irq_i, - .time_irq_i, - .* + .rvfi_csr_o (rvfi_csr) ); // ------------------------ @@ -1118,40 +1211,39 @@ module cva6 .CVA6Cfg(CVA6Cfg), .bp_resolve_t(bp_resolve_t) ) controller_i ( + .clk_i, + .rst_ni, // virtualization mode .v_i (v), // flush ports .set_pc_commit_o (set_pc_ctrl_pcgen), - .flush_unissued_instr_o(flush_unissued_instr_ctrl_id), .flush_if_o (flush_ctrl_if), + .flush_unissued_instr_o(flush_unissued_instr_ctrl_id), .flush_id_o (flush_ctrl_id), .flush_ex_o (flush_ctrl_ex), .flush_bp_o (flush_ctrl_bp), + .flush_icache_o (icache_flush_ctrl_cache), + .flush_dcache_o (dcache_flush_ctrl_cache), + .flush_dcache_ack_i (dcache_flush_ack_cache_ctrl), .flush_tlb_o (flush_tlb_ctrl_ex), .flush_tlb_vvma_o (flush_tlb_vvma_ctrl_ex), .flush_tlb_gvma_o (flush_tlb_gvma_ctrl_ex), - .flush_dcache_o (dcache_flush_ctrl_cache), - .flush_dcache_ack_i 
(dcache_flush_ack_cache_ctrl), - - .halt_csr_i (halt_csr_ctrl), - .halt_acc_i (halt_acc_ctrl), - .halt_o (halt_ctrl), + .halt_csr_i (halt_csr_ctrl), + .halt_acc_i (halt_acc_ctrl), + .halt_o (halt_ctrl), // control ports - .eret_i (eret), - .ex_valid_i (ex_commit.valid), - .set_debug_pc_i (set_debug_pc), - .flush_csr_i (flush_csr_ctrl), - .resolved_branch_i(resolved_branch), - .fence_i_i (fence_i_commit_controller), - .fence_i (fence_commit_controller), - .sfence_vma_i (sfence_vma_commit_controller), - .hfence_vvma_i (hfence_vvma_commit_controller), - .hfence_gvma_i (hfence_gvma_commit_controller), - .flush_commit_i (flush_commit), - .flush_acc_i (flush_acc), - - .flush_icache_o(icache_flush_ctrl_cache), - .* + .eret_i (eret), + .ex_valid_i (ex_commit.valid), + .set_debug_pc_i (set_debug_pc), + .resolved_branch_i (resolved_branch), + .flush_csr_i (flush_csr_ctrl), + .fence_i_i (fence_i_commit_controller), + .fence_i (fence_commit_controller), + .sfence_vma_i (sfence_vma_commit_controller), + .hfence_vvma_i (hfence_vvma_commit_controller), + .hfence_gvma_i (hfence_gvma_commit_controller), + .flush_commit_i (flush_commit), + .flush_acc_i (flush_acc) ); // ------------------- @@ -1422,6 +1514,7 @@ module cva6 assign halt_acc_ctrl = '0; assign stall_st_pending_ex = '0; assign flush_acc = '0; + assign single_step_acc_commit = '0; // D$ connection is unused assign dcache_req_ports_acc_cache = '0; @@ -1432,7 +1525,6 @@ module cva6 // Feed through cvxif assign cvxif_req_o = cvxif_req; - assign cvxif_resp = cvxif_resp_i; end : gen_no_accelerator // ------------------- @@ -1629,7 +1721,7 @@ module cva6 .lsu_ctrl_i (rvfi_lsu_ctrl), .wbdata_i (wbdata_ex_id), - .commit_ack_i(commit_macro_ack), + .commit_ack_i(commit_ack), .mem_paddr_i (rvfi_mem_paddr), .debug_mode_i(debug_mode), .wdata_i (wdata_commit_id), diff --git a/core/cva6_fifo_v3.sv b/core/cva6_fifo_v3.sv index 25bb881e53..4a9a560864 100644 --- a/core/cva6_fifo_v3.sv +++ b/core/cva6_fifo_v3.sv @@ -1,4 +1,5 @@ // Copyright 
2018 ETH Zurich and University of Bologna. +// Copyright 2024 - PlanV Technologies for additionnal contribution. // Copyright and related rights are licensed under the Solderpad Hardware // License, Version 0.51 (the "License"); you may not use this file except in // compliance with the License. You may obtain a copy of the License at @@ -9,9 +10,12 @@ // specific language governing permissions and limitations under the License. // Author: Florian Zaruba +// Additional contributions by: +// Angela Gonzalez - PlanV Technologies module cva6_fifo_v3 #( parameter bit FALL_THROUGH = 1'b0, // fifo is in fall-through mode + parameter bit FPGA_ALTERA = 1'b0, // FPGA Altera optimizations enabled parameter int unsigned DATA_WIDTH = 32, // default data width if the fifo is of type logic parameter int unsigned DEPTH = 8, // depth can be arbitrary from 0 to 2**32 parameter type dtype = logic [DATA_WIDTH-1:0], @@ -46,6 +50,8 @@ module cva6_fifo_v3 #( logic [ADDR_DEPTH:0] status_cnt_n, status_cnt_q; // actual memory dtype [FifoDepth - 1:0] mem_n, mem_q; + dtype data_ft_n, data_ft_q; + logic first_word_n, first_word_q; // fifo ram signals for fpga target logic fifo_ram_we; @@ -71,12 +77,13 @@ module cva6_fifo_v3 #( read_pointer_n = read_pointer_q; write_pointer_n = write_pointer_q; status_cnt_n = status_cnt_q; + data_ft_n = data_ft_q; + first_word_n = first_word_q; if (FPGA_EN) begin fifo_ram_we = '0; - fifo_ram_read_address = read_pointer_q; fifo_ram_write_address = '0; fifo_ram_wdata = '0; - data_o = (DEPTH == 0) ? data_i : fifo_ram_rdata; + data_o = (DEPTH == 0) ? data_i : (first_word_q ? data_ft_q : fifo_ram_rdata); end else begin data_o = (DEPTH == 0) ? 
data_i : mem_q[read_pointer_q]; mem_n = mem_q; @@ -89,6 +96,7 @@ module cva6_fifo_v3 #( fifo_ram_we = 1'b1; fifo_ram_write_address = write_pointer_q; fifo_ram_wdata = data_i; + first_word_n = FPGA_ALTERA && first_word_q && pop_i; end else begin // push the data onto the queue mem_n[write_pointer_q] = data_i; @@ -104,6 +112,8 @@ module cva6_fifo_v3 #( end if (pop_i && ~empty_o) begin + data_ft_n = data_i; + first_word_n = FPGA_EN && FPGA_ALTERA && first_word_q && push_i; // read from the queue is a default assignment // but increment the read pointer... if (read_pointer_n == FifoDepth[ADDR_DEPTH-1:0] - 1) read_pointer_n = '0; @@ -116,14 +126,23 @@ module cva6_fifo_v3 #( if (push_i && pop_i && ~full_o && ~empty_o) status_cnt_n = status_cnt_q; // FIFO is in pass through mode -> do not change the pointers - if (FALL_THROUGH && (status_cnt_q == 0) && push_i) begin - data_o = data_i; + if ((FALL_THROUGH || (FPGA_EN && FPGA_ALTERA)) && (status_cnt_q == 0) && push_i) begin + if (FALL_THROUGH) data_o = data_i; + if (FPGA_EN && FPGA_ALTERA) begin + data_ft_n = data_i; + first_word_n = '1; + end if (pop_i) begin + first_word_n = '0; status_cnt_n = status_cnt_q; read_pointer_n = read_pointer_q; write_pointer_n = write_pointer_q; end end + + if (FPGA_EN) fifo_ram_read_address = (FPGA_ALTERA == 1) ? 
read_pointer_n : read_pointer_q; + else fifo_ram_read_address = '0; + end // sequential process @@ -132,32 +151,53 @@ module cva6_fifo_v3 #( read_pointer_q <= '0; write_pointer_q <= '0; status_cnt_q <= '0; + first_word_q <= '0; + data_ft_q <= '0; end else begin if (flush_i) begin read_pointer_q <= '0; write_pointer_q <= '0; status_cnt_q <= '0; + if (FPGA_ALTERA) first_word_q <= '0; + if (FPGA_ALTERA) data_ft_q <= '0; end else begin read_pointer_q <= read_pointer_n; write_pointer_q <= write_pointer_n; status_cnt_q <= status_cnt_n; + if (FPGA_ALTERA) data_ft_q <= data_ft_n; + if (FPGA_ALTERA) first_word_q <= first_word_n; end end end if (FPGA_EN) begin : gen_fpga_queue - AsyncDpRam #( - .ADDR_WIDTH(ADDR_DEPTH), - .DATA_DEPTH(DEPTH), - .DATA_WIDTH($bits(dtype)) - ) fifo_ram ( - .Clk_CI (clk_i), - .WrEn_SI (fifo_ram_we), - .RdAddr_DI(fifo_ram_read_address), - .WrAddr_DI(fifo_ram_write_address), - .WrData_DI(fifo_ram_wdata), - .RdData_DO(fifo_ram_rdata) - ); + if (FPGA_ALTERA) begin + SyncDpRam_ind_r_w #( + .ADDR_WIDTH(ADDR_DEPTH), + .DATA_DEPTH(DEPTH), + .DATA_WIDTH($bits(dtype)) + ) fifo_ram ( + .Clk_CI (clk_i), + .WrEn_SI (fifo_ram_we), + .RdAddr_DI(fifo_ram_read_address), + .WrAddr_DI(fifo_ram_write_address), + .WrData_DI(fifo_ram_wdata), + .RdData_DO(fifo_ram_rdata) + ); + end else begin + AsyncDpRam #( + .ADDR_WIDTH(ADDR_DEPTH), + .DATA_DEPTH(DEPTH), + .DATA_WIDTH($bits(dtype)) + ) fifo_ram ( + .Clk_CI (clk_i), + .WrEn_SI (fifo_ram_we), + .RdAddr_DI(fifo_ram_read_address), + .WrAddr_DI(fifo_ram_write_address), + .WrData_DI(fifo_ram_wdata), + .RdData_DO(fifo_ram_rdata) + ); + end end else begin : gen_asic_queue always_ff @(posedge clk_i or negedge rst_ni) begin if (~rst_ni) begin diff --git a/core/cva6_mmu/cva6_mmu.sv b/core/cva6_mmu/cva6_mmu.sv index 39826da371..9afe90c17e 100644 --- a/core/cva6_mmu/cva6_mmu.sv +++ b/core/cva6_mmu/cva6_mmu.sv @@ -99,8 +99,8 @@ module cva6_mmu output dcache_req_i_t req_port_o, // PMP - input riscv::pmpcfg_t [CVA6Cfg.NrPMPEntries:0] 
pmpcfg_i, - input logic [CVA6Cfg.NrPMPEntries:0][CVA6Cfg.PLEN-3:0] pmpaddr_i + input riscv::pmpcfg_t [CVA6Cfg.NrPMPEntries-1:0] pmpcfg_i, + input logic [CVA6Cfg.NrPMPEntries-1:0][CVA6Cfg.PLEN-3:0] pmpaddr_i ); // memory management, pte for cva6 @@ -193,12 +193,15 @@ module cva6_mmu .lu_asid_i (itlb_lu_asid), .lu_vmid_i (vmid_i), .lu_vaddr_i (icache_areq_i.fetch_vaddr), + .lu_gpaddr_o (itlb_gpaddr), .lu_content_o (itlb_content), .lu_g_content_o(itlb_g_content), - .lu_gpaddr_o (itlb_gpaddr), + .asid_to_be_flushed_i, + .vmid_to_be_flushed_i, + .vaddr_to_be_flushed_i, + .gpaddr_to_be_flushed_i, .lu_is_page_o (itlb_is_page), - .lu_hit_o (itlb_lu_hit), - .* + .lu_hit_o (itlb_lu_hit) ); cva6_tlb #( @@ -221,12 +224,15 @@ module cva6_mmu .lu_asid_i (itlb_lu_asid), .lu_vmid_i (vmid_i), .lu_vaddr_i (lsu_vaddr_i), + .lu_gpaddr_o (dtlb_gpaddr), .lu_content_o (dtlb_content), .lu_g_content_o(dtlb_g_content), - .lu_gpaddr_o (dtlb_gpaddr), + .asid_to_be_flushed_i, + .vmid_to_be_flushed_i, + .vaddr_to_be_flushed_i, + .gpaddr_to_be_flushed_i, .lu_is_page_o (dtlb_is_page), - .lu_hit_o (dtlb_lu_hit), - .* + .lu_hit_o (dtlb_lu_hit) ); @@ -290,6 +296,7 @@ module cva6_mmu ) i_ptw ( .clk_i (clk_i), .rst_ni(rst_ni), + .flush_i, .ptw_active_o (ptw_active), .walking_instr_o (walking_instr), @@ -297,17 +304,27 @@ module cva6_mmu .ptw_error_at_g_st_o (ptw_error_at_g_st), .ptw_err_at_g_int_st_o (ptw_err_at_g_int_st), .ptw_access_exception_o(ptw_access_exception), + .enable_translation_i, + .enable_g_translation_i, + .en_ld_st_translation_i, + .en_ld_st_g_translation_i, + .v_i, + .ld_st_v_i, + .hlvx_inst_i (hlvx_inst_i), .lsu_is_store_i(lsu_is_store_i), // PTW memory interface .req_port_i (req_port_i), .req_port_o (req_port_o), - .update_vaddr_o(update_vaddr), - // to Shared TLB, update logic .shared_tlb_update_o(update_shared_tlb), + .update_vaddr_o(update_vaddr), + + .asid_i, + .vs_asid_i, + .vmid_i, // from shared TLB // did we miss? 
@@ -317,7 +334,11 @@ module cva6_mmu .itlb_req_i(itlb_req), - .hlvx_inst_i(hlvx_inst_i), + .satp_ppn_i, + .vsatp_ppn_i, + .hgatp_ppn_i, + .mxr_i, + .vmxr_i, // Performance counters .shared_tlb_miss_o(shared_tlb_miss), //open for now @@ -326,9 +347,7 @@ module cva6_mmu .pmpcfg_i (pmpcfg_i), .pmpaddr_i (pmpaddr_i), .bad_paddr_o(ptw_bad_paddr), - .bad_gpaddr_o(ptw_bad_gpaddr), - .* - + .bad_gpaddr_o(ptw_bad_gpaddr) ); //----------------------- diff --git a/core/cva6_mmu/cva6_ptw.sv b/core/cva6_mmu/cva6_ptw.sv index 34c2cf1a19..c4713701dd 100644 --- a/core/cva6_mmu/cva6_ptw.sv +++ b/core/cva6_mmu/cva6_ptw.sv @@ -83,12 +83,10 @@ module cva6_ptw output logic shared_tlb_miss_o, // PMP - - input riscv::pmpcfg_t [CVA6Cfg.NrPMPEntries:0] pmpcfg_i, - input logic [CVA6Cfg.NrPMPEntries:0][CVA6Cfg.PLEN-3:0] pmpaddr_i, + input riscv::pmpcfg_t [CVA6Cfg.NrPMPEntries-1:0] pmpcfg_i, + input logic [CVA6Cfg.NrPMPEntries-1:0][CVA6Cfg.PLEN-3:0] pmpaddr_i, output logic [CVA6Cfg.PLEN-1:0] bad_paddr_o, output logic [CVA6Cfg.GPLEN-1:0] bad_gpaddr_o - ); // input registers diff --git a/core/cva6_rvfi.sv b/core/cva6_rvfi.sv index 504841f831..83e197e0c5 100644 --- a/core/cva6_rvfi.sv +++ b/core/cva6_rvfi.sv @@ -67,6 +67,8 @@ module cva6_rvfi logic [CVA6Cfg.NrIssuePorts-1:0][CVA6Cfg.XLEN-1:0] rs1_forwarding; logic [CVA6Cfg.NrIssuePorts-1:0][CVA6Cfg.XLEN-1:0] rs2_forwarding; + logic [CVA6Cfg.NrCommitPorts-1:0][CVA6Cfg.XLEN-1:0] rvfi_intr; + logic [CVA6Cfg.NrCommitPorts-1:0][CVA6Cfg.VLEN-1:0] commit_instr_pc; fu_op [CVA6Cfg.NrCommitPorts-1:0] commit_instr_op; logic [CVA6Cfg.NrCommitPorts-1:0][REG_ADDR_SIZE-1:0] commit_instr_rs1; @@ -274,20 +276,34 @@ module cva6_rvfi always_ff @(posedge clk_i) begin for (int i = 0; i < CVA6Cfg.NrCommitPorts; i++) begin logic exception; - exception = commit_instr_valid[i] && ex_commit_valid && !commit_drop[i]; - rvfi_instr_o[i].valid <= (commit_ack[i] && !ex_commit_valid && !commit_drop[i]) || + logic valid; + exception = (i == 0) && commit_instr_valid[i] && 
ex_commit_valid && !commit_drop[i]; + valid = (commit_ack[i] && !ex_commit_valid && !commit_drop[i]) || (exception && (ex_commit_cause == riscv::ENV_CALL_MMODE || ex_commit_cause == riscv::ENV_CALL_SMODE || ex_commit_cause == riscv::ENV_CALL_UMODE)); - rvfi_instr_o[i].insn <= mem_q[commit_pointer[i]].instr; + rvfi_instr_o[i].valid <= valid; + rvfi_instr_o[i].insn <= mem_q[commit_pointer[i]].instr; // when trap, the instruction is not executed - rvfi_instr_o[i].trap <= exception; + rvfi_instr_o[i].trap <= exception; + + if (exception && ex_commit_cause[31]) begin + rvfi_intr[i] <= 'b101; + end else if (exception) begin + rvfi_intr[i] <= 'b11; + end + if (valid) begin + rvfi_intr[i] <= 0; + end + + rvfi_instr_o[i].intr <= rvfi_intr[i]; + rvfi_instr_o[i].cause <= ex_commit_cause; rvfi_instr_o[i].mode <= (CVA6Cfg.DebugEn && debug_mode) ? 2'b10 : priv_lvl; rvfi_instr_o[i].ixl <= CVA6Cfg.XLEN == 64 ? 2 : 1; - rvfi_instr_o[i].rs1_addr <= commit_instr_rs1[i][4:0]; - rvfi_instr_o[i].rs2_addr <= commit_instr_rs2[i][4:0]; - rvfi_instr_o[i].rd_addr <= commit_instr_rd[i][4:0]; + rvfi_instr_o[i].rs1_addr <= commit_instr_rs1[i]; + rvfi_instr_o[i].rs2_addr <= commit_instr_rs2[i]; + rvfi_instr_o[i].rd_addr <= commit_instr_rd[i]; rvfi_instr_o[i].rd_wdata <= (CVA6Cfg.FpPresent && is_rd_fpr( commit_instr_op[i] )) ? commit_instr_result[i] : wdata[i]; diff --git a/core/cvxif_compressed_if_driver.sv b/core/cvxif_compressed_if_driver.sv new file mode 100644 index 0000000000..e874b9b190 --- /dev/null +++ b/core/cvxif_compressed_if_driver.sv @@ -0,0 +1,74 @@ +// Copyright 2024 Thales DIS France SAS +// +// Licensed under the Solderpad Hardware Licence, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// SPDX-License-Identifier: Apache-2.0 WITH SHL-2.0 +// You may obtain a copy of the License at https://solderpad.org/licenses/ +// +// Original Author: Guillaume Chauvon + +module cvxif_compressed_if_driver #( + parameter config_pkg::cva6_cfg_t CVA6Cfg = config_pkg::cva6_cfg_empty, + parameter type x_compressed_req_t = logic, + parameter type x_compressed_resp_t = logic +) ( + // Subsystem Clock - SUBSYSTEM + input logic clk_i, + // Asynchronous reset active low - SUBSYSTEM + input logic rst_ni, + // CVA6 Hart id + input logic [CVA6Cfg.XLEN-1:0] hart_id_i, + + input logic [CVA6Cfg.NrIssuePorts-1:0] is_compressed_i, + input logic [CVA6Cfg.NrIssuePorts-1:0] is_illegal_i, + input logic [CVA6Cfg.NrIssuePorts-1:0][31:0] instruction_i, + + output logic [CVA6Cfg.NrIssuePorts-1:0] is_compressed_o, + output logic [CVA6Cfg.NrIssuePorts-1:0] is_illegal_o, + output logic [CVA6Cfg.NrIssuePorts-1:0][31:0] instruction_o, + input logic stall_i, + output logic stall_o, + // CVXIF Compressed interface + input logic compressed_ready_i, + input x_compressed_resp_t compressed_resp_i, + output logic compressed_valid_o, + output x_compressed_req_t compressed_req_o +); + + + always_comb begin + is_illegal_o = is_illegal_i; + instruction_o = instruction_i; + is_compressed_o = is_compressed_i; + compressed_valid_o = 1'b0; + compressed_req_o.instr = '0; + compressed_req_o.hartid = hart_id_i; + stall_o = stall_i; + if (is_illegal_i[0]) begin + compressed_valid_o = is_illegal_i[0]; + compressed_req_o.instr = instruction_i[0][15:0]; + is_illegal_o[0] = ~compressed_resp_i.accept; + instruction_o[0] = compressed_resp_i.accept ? compressed_resp_i.instr : instruction_i[0]; + is_compressed_o[0] = compressed_resp_i.accept ? 1'b0 : is_compressed_i[0]; + if (~stall_i) begin + // Propagate stall from macro decoder or wait for compressed ready if compressed transaction is happening. 
+ // Stall if both instruction are illegal + if (CVA6Cfg.SuperscalarEn) begin + stall_o = is_illegal_i[1]; + end else begin + stall_o = (compressed_valid_o && ~compressed_ready_i); + end + end + end + if (CVA6Cfg.SuperscalarEn) begin + if (~is_illegal_i[0] && is_illegal_i[1]) begin // 2nd instruction is illegal + compressed_valid_o = is_illegal_i[1]; + compressed_req_o.instr = instruction_i[1][15:0]; + is_illegal_o[1] = ~compressed_resp_i.accept; + instruction_o[1] = compressed_resp_i.accept ? compressed_resp_i.instr : instruction_i[1]; + is_compressed_o[1] = compressed_resp_i.accept ? 1'b0 : is_compressed_i[1]; + end + end + end + +endmodule diff --git a/core/cvxif_example/compressed_instr_decoder.sv b/core/cvxif_example/compressed_instr_decoder.sv new file mode 100644 index 0000000000..de2fbb391c --- /dev/null +++ b/core/cvxif_example/compressed_instr_decoder.sv @@ -0,0 +1,49 @@ +// Copyright 2024 Thales DIS France SAS +// +// Licensed under the Solderpad Hardware Licence, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// SPDX-License-Identifier: Apache-2.0 WITH SHL-2.0 +// You may obtain a copy of the License at https://solderpad.org/licenses/ +// +// Original Author: Guillaume Chauvon + +module compressed_instr_decoder #( + parameter type copro_compressed_resp_t = logic, + parameter int NbInstr = 1, + parameter copro_compressed_resp_t CoproInstr [NbInstr] = {0}, + parameter type x_compressed_req_t = logic, + parameter type x_compressed_resp_t = logic +) ( + input logic clk_i, + input logic rst_ni, + input logic compressed_valid_i, + input x_compressed_req_t compressed_req_i, + output logic compressed_ready_o, + output x_compressed_resp_t compressed_resp_o +); + + logic [NbInstr-1:0] sel; + + for (genvar i = 0; i < NbInstr; i++) begin : gen_predecoder_selector + assign sel[i] = ((CoproInstr[i].mask & compressed_req_i.instr) == CoproInstr[i].instr); + end + + always_comb begin + compressed_ready_o = '1; + compressed_resp_o.accept = '0; + compressed_resp_o.instr = '0; + for (int unsigned i = 0; i < NbInstr; i++) begin + if (sel[i] && compressed_valid_i) begin + compressed_resp_o.accept = CoproInstr[i].resp.accept; + compressed_resp_o.instr = CoproInstr[i].resp.instr; + // Remap rs1 and rs2 + compressed_resp_o.instr[19:15] = compressed_req_i.instr[11:7]; + compressed_resp_o.instr[24:20] = compressed_req_i.instr[6:2]; + end + end + end + + assert property (@(posedge clk_i) $onehot0(sel)) + else $warning("This offloaded instruction is valid for multiple coprocessor instructions !"); + +endmodule diff --git a/core/cvxif_example/copro_alu.sv b/core/cvxif_example/copro_alu.sv new file mode 100644 index 0000000000..672528454d --- /dev/null +++ b/core/cvxif_example/copro_alu.sv @@ -0,0 +1,136 @@ +// Copyright 2024 Thales DIS France SAS +// +// Licensed under the Solderpad Hardware Licence, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// SPDX-License-Identifier: Apache-2.0 WITH SHL-2.0 +// You may obtain a copy of the License at https://solderpad.org/licenses/ +// +// Original Author: Guillaume Chauvon + +module copro_alu + import cvxif_instr_pkg::*; +#( + parameter int unsigned NrRgprPorts = 2, + parameter int unsigned XLEN = 32, + parameter type hartid_t = logic, + parameter type id_t = logic, + parameter type registers_t = logic + +) ( + input logic clk_i, + input logic rst_ni, + input registers_t registers_i, + input opcode_t opcode_i, + input hartid_t hartid_i, + input id_t id_i, + input logic [ 4:0] rd_i, + output logic [XLEN-1:0] result_o, + output hartid_t hartid_o, + output id_t id_o, + output logic [ 4:0] rd_o, + output logic valid_o, + output logic we_o +); + + logic [XLEN-1:0] result_n, result_q; + hartid_t hartid_n, hartid_q; + id_t id_n, id_q; + logic valid_n, valid_q; + logic [4:0] rd_n, rd_q; + logic we_n, we_q; + + assign result_o = result_q; + assign hartid_o = hartid_q; + assign id_o = id_q; + assign valid_o = valid_q; + assign rd_o = rd_q; + assign we_o = we_q; + + always_comb begin + case (opcode_i) + cvxif_instr_pkg::NOP: begin + result_n = '0; + hartid_n = hartid_i; + id_n = id_i; + valid_n = 1'b1; + rd_n = '0; + we_n = '0; + end + cvxif_instr_pkg::ADD: begin + result_n = registers_i[1] + registers_i[0]; + hartid_n = hartid_i; + id_n = id_i; + valid_n = 1'b1; + rd_n = rd_i; + we_n = 1'b1; + end + cvxif_instr_pkg::DOUBLE_RS1: begin + result_n = registers_i[0] + registers_i[0]; + hartid_n = hartid_i; + id_n = id_i; + valid_n = 1'b1; + rd_n = rd_i; + we_n = 1'b1; + end + cvxif_instr_pkg::DOUBLE_RS2: begin + result_n = registers_i[1] + registers_i[1]; + hartid_n = hartid_i; + id_n = id_i; + valid_n = 1'b1; + rd_n = rd_i; + we_n = 1'b1; + end + cvxif_instr_pkg::ADD_MULTI: begin + result_n = registers_i[1] + registers_i[0]; + hartid_n = hartid_i; + id_n = id_i; + valid_n = 1'b1; + rd_n = rd_i; + we_n = 1'b1; + end + cvxif_instr_pkg::ADD_RS3_R4: begin + result_n = NrRgprPorts == 
3 ? registers_i[2] + registers_i[1] + registers_i[0] : registers_i[1] + registers_i[0]; + hartid_n = hartid_i; + id_n = id_i; + valid_n = 1'b1; + rd_n = rd_i; + we_n = 1'b1; + end + cvxif_instr_pkg::ADD_RS3_R: begin + result_n = NrRgprPorts == 3 ? registers_i[2] + registers_i[1] + registers_i[0] : registers_i[1] + registers_i[0]; + hartid_n = hartid_i; + id_n = id_i; + valid_n = 1'b1; + rd_n = 5'b01010; + we_n = 1'b1; + end + default: begin + result_n = '0; + hartid_n = '0; + id_n = '0; + valid_n = '0; + rd_n = '0; + we_n = '0; + end + endcase + end + + always_ff @(posedge clk_i, negedge rst_ni) begin + if (~rst_ni) begin + result_q <= '0; + hartid_q <= '0; + id_q <= '0; + valid_q <= '0; + rd_q <= '0; + we_q <= '0; + end else begin + result_q <= result_n; + hartid_q <= hartid_n; + id_q <= id_n; + valid_q <= valid_n; + rd_q <= rd_n; + we_q <= we_n; + end + end + +endmodule diff --git a/core/cvxif_example/cvxif_example_coprocessor.sv b/core/cvxif_example/cvxif_example_coprocessor.sv index 921fdfd0e0..614b17e850 100644 --- a/core/cvxif_example/cvxif_example_coprocessor.sv +++ b/core/cvxif_example/cvxif_example_coprocessor.sv @@ -1,19 +1,32 @@ -// Copyright 2021 Thales DIS design services SAS +// Copyright 2024 Thales DIS France SAS // // Licensed under the Solderpad Hardware Licence, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // SPDX-License-Identifier: Apache-2.0 WITH SHL-2.0 // You may obtain a copy of the License at https://solderpad.org/licenses/ // -// Original Author: Guillaume Chauvon (guillaume.chauvon@thalesgroup.com) -// Example coprocessor adds rs1,rs2(,rs3) together and gives back the result to the CPU via the CoreV-X-Interface. -// Coprocessor delays the sending of the result depending on result least significant bits. 
+// Original Author: Guillaume Chauvon module cvxif_example_coprocessor - import cvxif_pkg::*; import cvxif_instr_pkg::*; #( - parameter config_pkg::cva6_cfg_t CVA6Cfg = config_pkg::cva6_cfg_empty + // CVXIF Types + parameter int unsigned NrRgprPorts = 2, + parameter int unsigned XLEN = 32, + parameter type readregflags_t = logic, + parameter type writeregflags_t = logic, + parameter type id_t = logic, + parameter type hartid_t = logic, + parameter type x_compressed_req_t = logic, + parameter type x_compressed_resp_t = logic, + parameter type x_issue_req_t = logic, + parameter type x_issue_resp_t = logic, + parameter type x_register_t = logic, + parameter type x_commit_t = logic, + parameter type x_result_t = logic, + parameter type cvxif_req_t = logic, + parameter type cvxif_resp_t = logic, + localparam type registers_t = logic [NrRgprPorts-1:0][XLEN-1:0] ) ( input logic clk_i, // Clock input logic rst_ni, // Asynchronous reset active low @@ -21,138 +34,119 @@ module cvxif_example_coprocessor output cvxif_resp_t cvxif_resp_o ); - //Compressed interface - logic x_compressed_valid_i; - logic x_compressed_ready_o; - x_compressed_req_t x_compressed_req_i; - x_compressed_resp_t x_compressed_resp_o; - //Issue interface - logic x_issue_valid_i; - logic x_issue_ready_o; - x_issue_req_t x_issue_req_i; - x_issue_resp_t x_issue_resp_o; - //Commit interface - logic x_commit_valid_i; - x_commit_t x_commit_i; - //Memory interface - logic x_mem_valid_o; - logic x_mem_ready_i; - x_mem_req_t x_mem_req_o; - x_mem_resp_t x_mem_resp_i; - //Memory result interface - logic x_mem_result_valid_i; - x_mem_result_t x_mem_result_i; - //Result interface - logic x_result_valid_o; - logic x_result_ready_i; - x_result_t x_result_o; - - assign x_compressed_valid_i = cvxif_req_i.x_compressed_valid; - assign x_compressed_req_i = cvxif_req_i.x_compressed_req; - assign x_issue_valid_i = cvxif_req_i.x_issue_valid; - assign x_issue_req_i = cvxif_req_i.x_issue_req; - assign x_commit_valid_i = 
cvxif_req_i.x_commit_valid; - assign x_commit_i = cvxif_req_i.x_commit; - assign x_mem_ready_i = cvxif_req_i.x_mem_ready; - assign x_mem_resp_i = cvxif_req_i.x_mem_resp; - assign x_mem_result_valid_i = cvxif_req_i.x_mem_result_valid; - assign x_mem_result_i = cvxif_req_i.x_mem_result; - assign x_result_ready_i = cvxif_req_i.x_result_ready; - - assign cvxif_resp_o.x_compressed_ready = x_compressed_ready_o; - assign cvxif_resp_o.x_compressed_resp = x_compressed_resp_o; - assign cvxif_resp_o.x_issue_ready = x_issue_ready_o; - assign cvxif_resp_o.x_issue_resp = x_issue_resp_o; - assign cvxif_resp_o.x_mem_valid = x_mem_valid_o; - assign cvxif_resp_o.x_mem_req = x_mem_req_o; - assign cvxif_resp_o.x_result_valid = x_result_valid_o; - assign cvxif_resp_o.x_result = x_result_o; - - //Compressed interface - assign x_compressed_ready_o = '0; - assign x_compressed_resp_o.instr = '0; - assign x_compressed_resp_o.accept = '0; + // Compressed interface signals + x_compressed_req_t compressed_req; + x_compressed_resp_t compressed_resp; + logic compressed_valid, compressed_ready; + // Issue interface signals + x_issue_req_t issue_req; + x_issue_resp_t issue_resp; + logic issue_valid, issue_ready; + + // Register interface signals + x_register_t register; + logic register_valid; + + // Decoder and alu signals + registers_t registers; + opcode_t opcode; + hartid_t issue_hartid, hartid; + id_t issue_id, id; + logic [4:0] issue_rd, rd; + logic [XLEN-1:0] result; + logic we; + + // Issue and Register interface + // Mandatory when X_ISSUE_REGISTER_SPLIT = 0 + assign cvxif_resp_o.compressed_ready = compressed_ready; + assign cvxif_resp_o.compressed_resp = compressed_resp; + assign cvxif_resp_o.issue_ready = issue_ready; + assign cvxif_resp_o.issue_resp = issue_resp; + assign cvxif_resp_o.register_ready = cvxif_resp_o.issue_ready; + + assign compressed_req = cvxif_req_i.compressed_req; + assign compressed_valid = cvxif_req_i.compressed_valid; + assign issue_req = cvxif_req_i.issue_req; + 
assign issue_valid = cvxif_req_i.issue_valid; + assign register = cvxif_req_i.register; + assign register_valid = cvxif_req_i.register_valid; + + compressed_instr_decoder #( + .copro_compressed_resp_t(cvxif_instr_pkg::copro_compressed_resp_t), + .NbInstr(cvxif_instr_pkg::NbCompInstr), + .CoproInstr(cvxif_instr_pkg::CoproCompInstr), + .x_compressed_req_t(x_compressed_req_t), + .x_compressed_resp_t(x_compressed_resp_t) + ) compressed_instr_decoder_i ( + .clk_i (clk_i), + .rst_ni (rst_ni), + .compressed_valid_i(compressed_valid), + .compressed_req_i (compressed_req), + .compressed_ready_o(compressed_ready), + .compressed_resp_o (compressed_resp) + ); instr_decoder #( + .copro_issue_resp_t (cvxif_instr_pkg::copro_issue_resp_t), + .opcode_t (cvxif_instr_pkg::opcode_t), .NbInstr (cvxif_instr_pkg::NbInstr), - .CoproInstr(cvxif_instr_pkg::CoproInstr) + .CoproInstr(cvxif_instr_pkg::CoproInstr), + .NrRgprPorts(NrRgprPorts), + .hartid_t (hartid_t), + .id_t (id_t), + .x_issue_req_t (x_issue_req_t), + .x_issue_resp_t (x_issue_resp_t), + .x_register_t (x_register_t), + .registers_t (registers_t) ) instr_decoder_i ( - .clk_i (clk_i), - .x_issue_req_i (x_issue_req_i), - .x_issue_resp_o(x_issue_resp_o) + .clk_i (clk_i), + .rst_ni (rst_ni), + .issue_valid_i (issue_valid), + .issue_req_i (issue_req), + .issue_ready_o (issue_ready), + .issue_resp_o (issue_resp), + .register_valid_i(register_valid), + .register_i (register), + .registers_o (registers), + .opcode_o (opcode), + .hartid_o (issue_hartid), + .id_o (issue_id), + .rd_o (issue_rd) ); - typedef struct packed { - x_issue_req_t req; - x_issue_resp_t resp; - } x_issue_t; - - logic fifo_full, fifo_empty; - logic x_issue_ready_q; - logic instr_push, instr_pop; - x_issue_t req_i; - x_issue_t req_o; - - - - assign instr_push = x_issue_resp_o.accept ? 
1 : 0; - assign instr_pop = (x_commit_i.x_commit_kill && x_commit_valid_i) || x_result_valid_o; - assign x_issue_ready_q = ~fifo_full; // if something is in the fifo, the instruction is being processed - // so we can't receive anything else - assign req_i.req = x_issue_req_i; - assign req_i.resp = x_issue_resp_o; - - always_ff @(posedge clk_i or negedge rst_ni) begin : regs - if (!rst_ni) begin - x_issue_ready_o <= 1; - end else begin - x_issue_ready_o <= x_issue_ready_q; - end - end - - cva6_fifo_v3 #( - .FALL_THROUGH(1), //data_o ready and pop in the same cycle - .DATA_WIDTH (64), - .DEPTH (8), - .dtype (x_issue_t), - .FPGA_EN (CVA6Cfg.FpgaEn) - ) fifo_commit_i ( - .clk_i (clk_i), - .rst_ni (rst_ni), - .flush_i (1'b0), - .testmode_i(1'b0), - .full_o (fifo_full), - .empty_o (fifo_empty), - .usage_o (), - .data_i (req_i), - .push_i (instr_push), - .data_o (req_o), - .pop_i (instr_pop) - ); - - logic [3:0] c; - counter #( - .WIDTH(4) - ) counter_i ( - .clk_i (clk_i), - .rst_ni (rst_ni), - .clear_i (~x_commit_i.x_commit_kill && x_commit_valid_i), - .en_i (1'b1), - .load_i (), - .down_i (), - .d_i (), - .q_o (c), - .overflow_o() + logic alu_valid; + // Result interface + copro_alu #( + .NrRgprPorts(NrRgprPorts), + .XLEN(XLEN), + .hartid_t(hartid_t), + .id_t(id_t), + .registers_t(registers_t) + ) i_copro_alu ( + .clk_i (clk_i), + .rst_ni (rst_ni), + .registers_i(registers), + .opcode_i (opcode), + .hartid_i (issue_hartid), + .id_i (issue_id), + .rd_i (issue_rd), + .hartid_o (hartid), + .id_o (id), + .result_o (result), + .valid_o (alu_valid), + .rd_o (rd), + .we_o (we) ); always_comb begin - x_result_o.data = req_o.req.rs[0] + req_o.req.rs[1] + (X_NUM_RS == 3 ? req_o.req.rs[2] : 0); - x_result_valid_o = (c == x_result_o.data[3:0]) && ~fifo_empty ? 
1 : 0; - x_result_o.id = req_o.req.id; - x_result_o.rd = req_o.req.instr[11:7]; - x_result_o.we = req_o.resp.writeback & x_result_valid_o; - x_result_o.exc = 0; - x_result_o.exccode = 0; + cvxif_resp_o.result_valid = alu_valid; //TODO Should wait for ready from CPU + cvxif_resp_o.result.hartid = hartid; + cvxif_resp_o.result.id = id; + cvxif_resp_o.result.data = result; + cvxif_resp_o.result.rd = rd; + cvxif_resp_o.result.we = we; end + + endmodule diff --git a/core/cvxif_example/include/cvxif_instr_pkg.sv b/core/cvxif_example/include/cvxif_instr_pkg.sv index 035cb0488f..2c0c8a34bd 100644 --- a/core/cvxif_example/include/cvxif_instr_pkg.sv +++ b/core/cvxif_example/include/cvxif_instr_pkg.sv @@ -7,40 +7,144 @@ // // Original Author: Guillaume Chauvon (guillaume.chauvon@thalesgroup.com) + + package cvxif_instr_pkg; + typedef enum logic [3:0] { + ILLEGAL = 4'b0000, + NOP = 4'b0001, + ADD = 4'b0010, + DOUBLE_RS1 = 4'b0011, + DOUBLE_RS2 = 4'b0100, + ADD_MULTI = 4'b0101, + ADD_RS3_R4 = 4'b0110, + ADD_RS3_R = 4'b0111 + } opcode_t; + + typedef struct packed { - logic [31:0] instr; - logic [31:0] mask; - cvxif_pkg::x_issue_resp_t resp; + logic accept; + logic writeback; // TODO depends on dualwrite + logic [2:0] register_read; // TODO Nr read ports + } issue_resp_t; + + typedef struct packed { + logic accept; + logic [31:0] instr; + } compressed_resp_t; + + typedef struct packed { + logic [31:0] instr; + logic [31:0] mask; + issue_resp_t resp; + opcode_t opcode; } copro_issue_resp_t; - // 2 Possible RISCV instructions for Coprocessor - parameter int unsigned NbInstr = 2; + + typedef struct packed { + logic [15:0] instr; + logic [15:0] mask; + compressed_resp_t resp; + } copro_compressed_resp_t; + + // 4 Possible RISCV instructions for Coprocessor + parameter int unsigned NbInstr = 10; parameter copro_issue_resp_t CoproInstr[NbInstr] = '{ '{ - instr: 32'b00000_00_00000_00000_0_00_00000_0101011, // custom1 opcode - mask: 32'b00000_00_00000_00000_0_00_00000_1111111, - resp : 
'{ - accept : 1'b1, - writeback : 1'b0, - dualwrite : 1'b0, - dualread : 1'b0, - loadstore : 1'b0, - exc : 1'b0 - } - }, - '{ - instr: 32'b00000_00_00000_00000_0_00_00000_1011011, // custom2 opcode - mask: 32'b00000_00_00000_00000_0_00_00000_1111111, - resp : '{ - accept : 1'b1, - writeback : 1'b1, - dualwrite : 1'b0, - dualread : 1'b0, - loadstore : 1'b0, - exc : 1'b0 - } + // Custom Nop + instr: + 32'b00000_00_00000_00000_0_00_00000_1111011, // custom3 opcode + mask: 32'b11111_11_00000_00000_1_11_00000_1111111, + resp : '{accept : 1'b1, writeback : 1'b0, register_read : {1'b0, 1'b0, 1'b0}}, + opcode : NOP + }, + '{ + // Custom Add : cus_add rd, rs1, rs2 + instr: + 32'b00000_00_00000_00000_0_01_00000_1111011, // custom3 opcode + mask: 32'b11111_11_00000_00000_1_11_00000_1111111, + resp : '{accept : 1'b1, writeback : 1'b1, register_read : {1'b0, 1'b1, 1'b1}}, + opcode : ADD + }, + '{ + // Custom Add rs1 : cus_add rd, rs1, rs1 + instr: + 32'b00000_01_00000_00000_0_01_00000_1111011, // custom3 opcode + mask: 32'b11111_11_00000_00000_1_11_00000_1111111, + resp : '{accept : 1'b1, writeback : 1'b1, register_read : {1'b0, 1'b0, 1'b1}}, + opcode : DOUBLE_RS1 + }, + '{ + // Custom Add rs2 : cus_add rd, rs2, rs2 + instr: + 32'b00000_10_00000_00000_0_01_00000_1111011, // custom3 opcode + mask: 32'b11111_11_00000_00000_1_11_00000_1111111, + resp : '{accept : 1'b1, writeback : 1'b1, register_read : {1'b0, 1'b1, 1'b0}}, + opcode : DOUBLE_RS2 + }, + '{ + // Custom Add Multi rs1 : cus_add rd, rs1, rs1 + instr: + 32'b00000_11_00000_00000_0_01_00000_1111011, // custom3 opcode + mask: 32'b11111_11_00000_00000_1_11_00000_1111111, + resp : '{accept : 1'b1, writeback : 1'b1, register_read : {1'b0, 1'b1, 1'b1}}, + opcode : ADD_MULTI + }, + '{ + // Custom Add Multi rs1 : cus_add rd, rs1, rs1 + instr: + 32'b00001_00_00000_00000_0_01_00000_1111011, // custom3 opcode + mask: 32'b11111_11_00000_00000_1_11_00000_1111111, + resp : '{accept : 1'b1, writeback : 1'b1, register_read : {1'b1, 1'b1, 
1'b1}}, + opcode : ADD_RS3_R + }, + '{ + // Custom Add Multi rs1 : cus_add rd, rs1, rs1 + instr: + 32'b00000_00_00000_00000_0_00_00000_1000011, // MADD opcode + mask: 32'b00000_11_00000_00000_1_11_00000_1111111, + resp : '{accept : 1'b1, writeback : 1'b1, register_read : {1'b1, 1'b1, 1'b1}}, + opcode : ADD_RS3_R4 + }, + '{ + // Custom Add Multi rs1 : cus_add rd, rs1, rs1 + instr: + 32'b00000_00_00000_00000_0_00_00000_1000111, // MSUB opcode + mask: 32'b00000_11_00000_00000_1_11_00000_1111111, + resp : '{accept : 1'b1, writeback : 1'b1, register_read : {1'b1, 1'b1, 1'b1}}, + opcode : ADD_RS3_R4 + }, + '{ + // Custom Add Multi rs1 : cus_add rd, rs1, rs1 + instr: + 32'b00000_00_00000_00000_0_00_00000_1001011, // NMSUB opcode + mask: 32'b00000_11_00000_00000_1_11_00000_1111111, + resp : '{accept : 1'b1, writeback : 1'b1, register_read : {1'b1, 1'b1, 1'b1}}, + opcode : ADD_RS3_R4 + }, + '{ + // Custom Add Multi rs1 : cus_add rd, rs1, rs1 + instr: + 32'b00000_00_00000_00000_0_00_00000_1001111, // NMADD opcode + mask: 32'b00000_11_00000_00000_1_11_00000_1111111, + resp : '{accept : 1'b1, writeback : 1'b1, register_read : {1'b1, 1'b1, 1'b1}}, + opcode : ADD_RS3_R4 + } + }; + + parameter int unsigned NbCompInstr = 2; + parameter copro_compressed_resp_t CoproCompInstr[NbCompInstr] = '{ + // C_NOP + '{ + instr : 16'b111_0_00000_00000_00, + mask : 16'b111_1_00000_00000_11, + resp : '{accept : 1'b1, instr : 32'b00000_00_00000_00000_0_00_00000_1111011} + }, + '{ + instr : 16'b111_1_00000_00000_00, + mask : 16'b111_1_00000_00000_11, + resp : '{accept : 1'b1, instr : 32'b00000_00_00000_00000_0_01_01010_1111011} } }; diff --git a/core/cvxif_example/instr_decoder.sv b/core/cvxif_example/instr_decoder.sv index 0cf1bdf32d..8e952b6dfe 100644 --- a/core/cvxif_example/instr_decoder.sv +++ b/core/cvxif_example/instr_decoder.sv @@ -1,46 +1,86 @@ -// Copyright 2021 Thales DIS design services SAS +// Copyright 2024 Thales DIS France SAS // // Licensed under the Solderpad Hardware Licence, 
Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // SPDX-License-Identifier: Apache-2.0 WITH SHL-2.0 // You may obtain a copy of the License at https://solderpad.org/licenses/ // -// Original Author: Guillaume Chauvon (guillaume.chauvon@thalesgroup.com) +// Original Author: Guillaume Chauvon -module instr_decoder - import cvxif_pkg::*; -#( - parameter int NbInstr = 1, - parameter cvxif_instr_pkg::copro_issue_resp_t CoproInstr[NbInstr] = {0} +module instr_decoder #( + parameter type copro_issue_resp_t = logic, + parameter type opcode_t = logic, + parameter int NbInstr = 1, + parameter copro_issue_resp_t CoproInstr [NbInstr] = {0}, + parameter int unsigned NrRgprPorts = 2, + parameter type hartid_t = logic, + parameter type id_t = logic, + parameter type x_issue_req_t = logic, + parameter type x_issue_resp_t = logic, + parameter type x_register_t = logic, + parameter type registers_t = logic ) ( - input logic clk_i, - input x_issue_req_t x_issue_req_i, - output x_issue_resp_t x_issue_resp_o + input logic clk_i, + input logic rst_ni, + input logic issue_valid_i, + input x_issue_req_t issue_req_i, + output logic issue_ready_o, + output x_issue_resp_t issue_resp_o, + input logic register_valid_i, + input x_register_t register_i, + output registers_t registers_o, + output opcode_t opcode_o, + output hartid_t hartid_o, + output id_t id_o, + output logic [4:0] rd_o ); logic [NbInstr-1:0] sel; + logic rs1_ready; + logic rs2_ready; + logic rs3_ready; for (genvar i = 0; i < NbInstr; i++) begin : gen_predecoder_selector - assign sel[i] = ((CoproInstr[i].mask & x_issue_req_i.instr) == CoproInstr[i].instr); + assign sel[i] = ((CoproInstr[i].mask & issue_req_i.instr) == CoproInstr[i].instr); end always_comb begin - x_issue_resp_o.accept = '0; - x_issue_resp_o.writeback = '0; - x_issue_resp_o.dualwrite = '0; - x_issue_resp_o.dualread = '0; - x_issue_resp_o.loadstore = '0; - x_issue_resp_o.exc = '0; + rs1_ready = '0; + rs2_ready = '0; 
+ rs3_ready = '0; + issue_ready_o = '0; + issue_resp_o.accept = '0; + issue_resp_o.writeback = '0; + issue_resp_o.register_read = '0; + registers_o = '0; + opcode_o = opcode_t'(0); // == ILLEGAL see cvxif_instr_pkg.sv + hartid_o = '0; + id_o = '0; + rd_o = '0; for (int unsigned i = 0; i < NbInstr; i++) begin - if (sel[i]) begin - x_issue_resp_o.accept = CoproInstr[i].resp.accept; - x_issue_resp_o.writeback = CoproInstr[i].resp.writeback; - x_issue_resp_o.dualwrite = CoproInstr[i].resp.dualwrite; - x_issue_resp_o.dualread = CoproInstr[i].resp.dualread; - x_issue_resp_o.loadstore = CoproInstr[i].resp.loadstore; - x_issue_resp_o.exc = CoproInstr[i].resp.exc; + if (sel[i] && issue_valid_i) begin + issue_resp_o.accept = CoproInstr[i].resp.accept; + issue_resp_o.writeback = CoproInstr[i].resp.writeback; + issue_resp_o.register_read = CoproInstr[i].resp.register_read; // Warning : potential 3 bits vector into 2 bits one + if (issue_resp_o.accept) begin + rs1_ready = (~CoproInstr[i].resp.register_read[0] || register_i.rs_valid[0]); + rs2_ready = (~CoproInstr[i].resp.register_read[1] || register_i.rs_valid[1]); + rs3_ready = NrRgprPorts == 3 ? (~CoproInstr[i].resp.register_read[2] || register_i.rs_valid[2]) : 1'b1; + issue_ready_o = rs1_ready && rs2_ready && rs3_ready; + end + opcode_o = CoproInstr[i].opcode; + id_o = issue_req_i.id; + hartid_o = issue_req_i.hartid; + rd_o = issue_req_i.instr[11:7]; + for (int unsigned j = 0; j < NrRgprPorts; j++) begin + registers_o[j] = issue_resp_o.register_read[j] ? 
register_i.rs[j] : '0; + end end end + // Coprocessor could not decode offloaded instruction -> instruction is not accepted + if (issue_valid_i && ~(|sel)) begin + issue_ready_o = 1'b1; + end end assert property (@(posedge clk_i) $onehot0(sel)) diff --git a/core/cvxif_fu.sv b/core/cvxif_fu.sv index 4630be9e1a..b191fbff4e 100644 --- a/core/cvxif_fu.sv +++ b/core/cvxif_fu.sv @@ -1,13 +1,14 @@ -// Copyright 2021 Thales DIS design services SAS +// Copyright 2024 Thales DIS France SAS // // Licensed under the Solderpad Hardware Licence, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // SPDX-License-Identifier: Apache-2.0 WITH SHL-2.0 // You may obtain a copy of the License at https://solderpad.org/licenses/ // -// Original Author: Guillaume CHAUVON (guillaume.chauvon@thalesgroup.com) +// Original Author: Guillaume Chauvon -// Functional Unit for the logic of the CoreV-X-Interface +// Functional Unit for the CoreV-X-Interface +// Handles Result interface and exception forwarding to next stages. 
module cvxif_fu @@ -15,116 +16,60 @@ module cvxif_fu #( parameter config_pkg::cva6_cfg_t CVA6Cfg = config_pkg::cva6_cfg_empty, parameter type exception_t = logic, - parameter type fu_data_t = logic + parameter type x_result_t = logic ) ( // Subsystem Clock - SUBSYSTEM - input logic clk_i, + input logic clk_i, // Asynchronous reset active low - SUBSYSTEM - input logic rst_ni, - // FU data needed to execute instruction - ISSUE_STAGE - input fu_data_t fu_data_i, - // Current privilege mode - CSR_REGFILE - input riscv::priv_lvl_t priv_lvl_i, + input logic rst_ni, // CVXIF instruction is valid - ISSUE_STAGE - input logic x_valid_i, - // CVXIF is ready - ISSUE_STAGE - output logic x_ready_o, + input logic x_valid_i, + // Transaction ID - ISSUE_STAGE + input logic [CVA6Cfg.TRANS_ID_BITS-1:0] x_trans_id_i, + // Instruction is illegal, determined during CVXIF issue transaction - ISSUE_STAGE + input logic x_illegal_i, // Offloaded instruction - ISSUE_STAGE - input logic [ 31:0] x_off_instr_i, - // CVXIF transaction ID - ISSUE_STAGE - output logic [CVA6Cfg.TRANS_ID_BITS-1:0] x_trans_id_o, + input logic [ 31:0] x_off_instr_i, + // CVXIF is ready - ISSUE_STAGE + output logic x_ready_o, + // CVXIF result transaction ID - ISSUE_STAGE + output logic [CVA6Cfg.TRANS_ID_BITS-1:0] x_trans_id_o, // CVXIF exception - ISSUE_STAGE - output exception_t x_exception_o, + output exception_t x_exception_o, // CVXIF FU result - ISSUE_STAGE - output logic [ CVA6Cfg.XLEN-1:0] x_result_o, + output logic [ CVA6Cfg.XLEN-1:0] x_result_o, // CVXIF result valid - ISSUE_STAGE - output logic x_valid_o, + output logic x_valid_o, // CVXIF write enable - ISSUE_STAGE - output logic x_we_o, - // CVXIF request - SUBSYSTEM - output cvxif_pkg::cvxif_req_t cvxif_req_o, - // CVXIF response - SUBSYSTEM - input cvxif_pkg::cvxif_resp_t cvxif_resp_i + output logic x_we_o, + // CVXIF destination register - ISSUE_STAGE + output logic [ 4:0] x_rd_o, + // CVXIF result interface + input logic result_valid_i, + input 
x_result_t result_i, + output logic result_ready_o ); - localparam X_NUM_RS = ariane_pkg::NR_RGPR_PORTS; - logic illegal_n, illegal_q; - logic [CVA6Cfg.TRANS_ID_BITS-1:0] illegal_id_n, illegal_id_q; - logic [31:0] illegal_instr_n, illegal_instr_q; - logic [X_NUM_RS-1:0] rs_valid; - if (cvxif_pkg::X_NUM_RS == 3) begin : gen_third_operand - assign rs_valid = 3'b111; - end else begin : gen_no_third_operand - assign rs_valid = 2'b11; - end - always_comb begin - cvxif_req_o = '0; - cvxif_req_o.x_result_ready = 1'b1; - x_ready_o = cvxif_resp_i.x_issue_ready; - if (x_valid_i) begin - cvxif_req_o.x_issue_valid = x_valid_i; - cvxif_req_o.x_issue_req.instr = x_off_instr_i; - cvxif_req_o.x_issue_req.mode = priv_lvl_i; - cvxif_req_o.x_issue_req.id = fu_data_i.trans_id; - cvxif_req_o.x_issue_req.rs[0] = fu_data_i.operand_a; - cvxif_req_o.x_issue_req.rs[1] = fu_data_i.operand_b; - if (cvxif_pkg::X_NUM_RS == 3) begin - cvxif_req_o.x_issue_req.rs[2] = fu_data_i.imm; - end - cvxif_req_o.x_issue_req.rs_valid = rs_valid; - cvxif_req_o.x_commit_valid = x_valid_i; - cvxif_req_o.x_commit.id = fu_data_i.trans_id; - cvxif_req_o.x_commit.x_commit_kill = 1'b0; - end - end + assign result_ready_o = 1'b1; - always_comb begin - illegal_n = illegal_q; - illegal_id_n = illegal_id_q; - illegal_instr_n = illegal_instr_q; - if (~cvxif_resp_i.x_issue_resp.accept && cvxif_req_o.x_issue_valid && cvxif_resp_i.x_issue_ready && ~illegal_n) begin - illegal_n = 1'b1; - illegal_id_n = cvxif_req_o.x_issue_req.id; - illegal_instr_n = cvxif_req_o.x_issue_req.instr; - end - x_valid_o = cvxif_resp_i.x_result_valid; //Read result only when CVXIF is enabled - x_trans_id_o = x_valid_o ? cvxif_resp_i.x_result.id : '0; - x_result_o = x_valid_o ? cvxif_resp_i.x_result.data : '0; - x_exception_o.cause = x_valid_o ? {{(CVA6Cfg.XLEN-6){1'b0}}, cvxif_resp_i.x_result.exccode} : '0; - x_exception_o.valid = x_valid_o ? 
cvxif_resp_i.x_result.exc : '0; - x_exception_o.tval = '0; - x_exception_o.tinst = '0; - x_exception_o.tval2 = '0; - x_exception_o.gva = '0; - x_we_o = x_valid_o ? cvxif_resp_i.x_result.we : '0; - if (illegal_n) begin - if (~x_valid_o) begin - x_trans_id_o = illegal_id_n; - x_result_o = '0; - x_valid_o = 1'b1; - x_exception_o.cause = riscv::ILLEGAL_INSTR; - x_exception_o.valid = 1'b1; - if (CVA6Cfg.TvalEn) x_exception_o.tval = illegal_instr_n; - x_exception_o.tinst = '0; - x_exception_o.tval2 = '0; - x_exception_o.gva = '0; - x_we_o = '0; - illegal_n = '0; // Reset flag for illegal instr. illegal_id and illegal instr values are a don't care, no need to reset it. - end - end - end + assign x_ready_o = 1'b1; // Readyness of cvxif_fu is determined in issue stage by CVXIF issue interface + // Result signals + assign x_valid_o = x_illegal_i && x_valid_i ? 1'b1 : result_valid_i; + assign x_result_o = result_i.data; + assign x_trans_id_o = x_illegal_i ? x_trans_id_i : result_i.id; + assign x_we_o = result_i.we; + assign x_rd_o = result_i.rd; - always_ff @(posedge clk_i, negedge rst_ni) begin - if (~rst_ni) begin - illegal_q <= 1'b0; - illegal_id_q <= '0; - illegal_instr_q <= '0; - end else begin - illegal_q <= illegal_n; - illegal_id_q <= illegal_id_n; - illegal_instr_q <= illegal_instr_n; + // Handling of illegal instruction exception + always_comb begin + x_exception_o = '0; // No exception in this interface + if (x_illegal_i && x_valid_i) begin + x_exception_o.valid = '1; + x_exception_o.cause = riscv::ILLEGAL_INSTR; + if (CVA6Cfg.TvalEn) + x_exception_o.tval = x_off_instr_i; // TODO Optimization : Set exception in IRO. 
end end diff --git a/core/cvxif_issue_register_commit_if_driver.sv b/core/cvxif_issue_register_commit_if_driver.sv new file mode 100644 index 0000000000..2b6ab540d0 --- /dev/null +++ b/core/cvxif_issue_register_commit_if_driver.sv @@ -0,0 +1,68 @@ +// Copyright 2024 Thales DIS France SAS +// +// Licensed under the Solderpad Hardware Licence, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// SPDX-License-Identifier: Apache-2.0 WITH SHL-2.0 +// You may obtain a copy of the License at https://solderpad.org/licenses/ +// +// Original Author: Guillaume Chauvon + +module cvxif_issue_register_commit_if_driver #( + parameter config_pkg::cva6_cfg_t CVA6Cfg = config_pkg::cva6_cfg_empty, + parameter type x_issue_req_t = logic, + parameter type x_issue_resp_t = logic, + parameter type x_register_t = logic, + parameter type x_commit_t = logic +) ( + // CVA6 inputs + input logic clk_i, + input logic rst_ni, + input logic flush_i, + input logic [CVA6Cfg.XLEN-1:0] hart_id_i, + // CVXIF Issue interface + input logic issue_ready_i, + input x_issue_resp_t issue_resp_i, + output logic issue_valid_o, + output x_issue_req_t issue_req_o, + // CVXIF Register interface + input logic register_ready_i, + output logic register_valid_o, + output x_register_t register_o, + // CVXIF Commit interface + output logic commit_valid_o, + output x_commit_t commit_o, + // IRO in/out + input logic valid_i, + input logic [31:0] x_off_instr_i, + input logic [CVA6Cfg.TRANS_ID_BITS-1:0] x_trans_id_i, + input [(CVA6Cfg.NrRgprPorts/CVA6Cfg.NrIssuePorts)-1:0][CVA6Cfg.XLEN-1:0] register_i, + input logic [(CVA6Cfg.NrRgprPorts/CVA6Cfg.NrIssuePorts)-1:0] rs_valid_i, + output logic cvxif_busy_o +); + // X_ISSUE_REGISTER_SPLIT = 0 : Issue and register transactions are synchrone + // Mandatory assignement + assign register_valid_o = issue_valid_o; + assign register_o.hartid = issue_req_o.hartid; + assign register_o.id = issue_req_o.id; + // cvxif can not take any 
more instruction if issue transaction is still up. + assign cvxif_busy_o = issue_valid_o && ~issue_ready_i; + always_comb begin + issue_valid_o = valid_i && ~flush_i; + issue_req_o.instr = x_off_instr_i; + issue_req_o.hartid = hart_id_i; + issue_req_o.id = x_trans_id_i; + register_o.rs = register_i; + register_o.rs_valid = rs_valid_i; + end + + /* WARNING */ + // Always commit since speculation in execute in not possible : TODO to be verified + + // Always do commit transaction with issue + // If instruction goes to execute then it is not speculative + assign commit_valid_o = issue_valid_o; + assign commit_o.hartid = issue_req_o.hartid; + assign commit_o.id = issue_req_o.id; + assign commit_o.commit_kill = 1'b0; + +endmodule diff --git a/core/decoder.sv b/core/decoder.sv index 703ab7c074..65e7c22450 100644 --- a/core/decoder.sv +++ b/core/decoder.sv @@ -112,7 +112,8 @@ module decoder SBIMM, UIMM, JIMM, - RS3 + RS3, + MUX_RD_RS3 } imm_select; logic [CVA6Cfg.XLEN-1:0] imm_i_type; @@ -120,7 +121,6 @@ module decoder logic [CVA6Cfg.XLEN-1:0] imm_sb_type; logic [CVA6Cfg.XLEN-1:0] imm_u_type; logic [CVA6Cfg.XLEN-1:0] imm_uj_type; - logic [CVA6Cfg.XLEN-1:0] imm_bi_type; // --------------------------------------- // Accelerator instructions' first-pass decoder @@ -178,7 +178,6 @@ module decoder instruction_o.use_zimm = 1'b0; instruction_o.bp = branch_predict_i; instruction_o.vfp = 1'b0; - tinst = '0; ecall = 1'b0; ebreak = 1'b0; check_fprm = 1'b0; @@ -187,9 +186,9 @@ module decoder case (instr.rtype.opcode) riscv::OpcodeSystem: begin instruction_o.fu = CSR; - instruction_o.rs1[4:0] = instr.itype.rs1; - instruction_o.rs2[4:0] = instr.rtype.rs2; //TODO: needs to be checked if better way is available - instruction_o.rd[4:0] = instr.itype.rd; + instruction_o.rs1 = instr.itype.rs1; + instruction_o.rs2 = instr.rtype.rs2; //TODO: needs to be checked if better way is available + instruction_o.rd = instr.itype.rd; unique case (instr.itype.funct3) 3'b000: begin @@ -328,13 +327,13 @@ 
module decoder if (instr.instr[25] != 1'b0) begin instruction_o.fu = STORE; imm_select = NOIMM; - instruction_o.rs1[4:0] = instr.stype.rs1; - instruction_o.rs2[4:0] = instr.stype.rs2; + instruction_o.rs1 = instr.stype.rs1; + instruction_o.rs2 = instr.stype.rs2; end else begin instruction_o.fu = LOAD; imm_select = NOIMM; - instruction_o.rs1[4:0] = instr.itype.rs1; - instruction_o.rd[4:0] = instr.itype.rd; + instruction_o.rs1 = instr.itype.rs1; + instruction_o.rd = instr.itype.rd; end // Hypervisor load/store instructions when V=1 cause virtual instruction @@ -401,25 +400,25 @@ module decoder 3'b010: begin // CSRRS imm_select = IIMM; // this is just a read - if (instr.itype.rs1 == 5'b0) instruction_o.op = ariane_pkg::CSR_READ; + if (instr.itype.rs1 == '0) instruction_o.op = ariane_pkg::CSR_READ; else instruction_o.op = ariane_pkg::CSR_SET; end // atomically clear values in the CSR and write back to rd 3'b011: begin // CSRRC imm_select = IIMM; // this is just a read - if (instr.itype.rs1 == 5'b0) instruction_o.op = ariane_pkg::CSR_READ; + if (instr.itype.rs1 == '0) instruction_o.op = ariane_pkg::CSR_READ; else instruction_o.op = ariane_pkg::CSR_CLEAR; end // use zimm and iimm 3'b101: begin // CSRRWI - instruction_o.rs1[4:0] = instr.itype.rs1; + instruction_o.rs1 = instr.itype.rs1; imm_select = IIMM; instruction_o.use_zimm = 1'b1; instruction_o.op = ariane_pkg::CSR_WRITE; end 3'b110: begin // CSRRSI - instruction_o.rs1[4:0] = instr.itype.rs1; + instruction_o.rs1 = instr.itype.rs1; imm_select = IIMM; instruction_o.use_zimm = 1'b1; // this is just a read @@ -427,11 +426,11 @@ module decoder else instruction_o.op = ariane_pkg::CSR_SET; end 3'b111: begin // CSRRCI - instruction_o.rs1[4:0] = instr.itype.rs1; + instruction_o.rs1 = instr.itype.rs1; imm_select = IIMM; instruction_o.use_zimm = 1'b1; // this is just a read - if (instr.itype.rs1 == 5'b0) instruction_o.op = ariane_pkg::CSR_READ; + if (instr.itype.rs1 == '0) instruction_o.op = ariane_pkg::CSR_READ; else 
instruction_o.op = ariane_pkg::CSR_CLEAR; end default: illegal_instr = 1'b1; @@ -467,24 +466,24 @@ module decoder if (CVA6Cfg.FpPresent && CVA6Cfg.XFVec && fs_i != riscv::Off && ((CVA6Cfg.RVH && (!v_i || vfs_i != riscv::Off)) || !CVA6Cfg.RVH)) begin automatic logic allow_replication; // control honoring of replication flag - instruction_o.fu = FPU_VEC; // Same unit, but sets 'vectorial' signal - instruction_o.rs1[4:0] = instr.rvftype.rs1; - instruction_o.rs2[4:0] = instr.rvftype.rs2; - instruction_o.rd[4:0] = instr.rvftype.rd; - check_fprm = 1'b1; - allow_replication = 1'b1; + instruction_o.fu = FPU_VEC; // Same unit, but sets 'vectorial' signal + instruction_o.rs1 = instr.rvftype.rs1; + instruction_o.rs2 = instr.rvftype.rs2; + instruction_o.rd = instr.rvftype.rd; + check_fprm = 1'b1; + allow_replication = 1'b1; // decode vectorial FP instruction unique case (instr.rvftype.vecfltop) 5'b00001: begin - instruction_o.op = ariane_pkg::FADD; // vfadd.vfmt - Vectorial FP Addition - instruction_o.rs1 = '0; // Operand A is set to 0 - instruction_o.rs2[4:0] = instr.rvftype.rs1; // Operand B is set to rs1 - imm_select = IIMM; // Operand C is set to rs2 + instruction_o.op = ariane_pkg::FADD; // vfadd.vfmt - Vectorial FP Addition + instruction_o.rs1 = '0; // Operand A is set to 0 + instruction_o.rs2 = instr.rvftype.rs1; // Operand B is set to rs1 + imm_select = IIMM; // Operand C is set to rs2 end 5'b00010: begin instruction_o.op = ariane_pkg::FSUB; // vfsub.vfmt - Vectorial FP Subtraction instruction_o.rs1 = '0; // Operand A is set to 0 - instruction_o.rs2[4:0] = instr.rvftype.rs1; // Operand B is set to rs1 + instruction_o.rs2 = instr.rvftype.rs1; // Operand B is set to rs1 imm_select = IIMM; // Operand C is set to rs2 end 5'b00011: @@ -515,7 +514,7 @@ module decoder 5'b01100: begin unique case (instr.rvftype.rs2) inside // operation encoded in rs2, `inside` for matching ? 
5'b00000: begin - instruction_o.rs2[4:0] = instr.rvftype.rs1; // set rs2 = rs1 so we can map FMV to SGNJ in the unit + instruction_o.rs2 = instr.rvftype.rs1; // set rs2 = rs1 so we can map FMV to SGNJ in the unit if (instr.rvftype.repl) instruction_o.op = ariane_pkg::FMV_X2F; // vfmv.vfmt.x - GPR to FPR Move else instruction_o.op = ariane_pkg::FMV_F2X; // vfmv.x.vfmt - FPR to GPR Move @@ -532,7 +531,7 @@ module decoder instruction_o.op = ariane_pkg::FCVT_I2F; // vfcvt.vfmt.x - Vectorial Int to FP Conversion 5'b001??: begin instruction_o.op = ariane_pkg::FCVT_F2F; // vfcvt.vfmt.vfmt - Vectorial FP to FP Conversion - instruction_o.rs2[4:0] = instr.rvftype.rd; // set rs2 = rd as target vector for conversion + instruction_o.rs2 = instr.rvftype.rd; // set rs2 = rd as target vector for conversion imm_select = IIMM; // rs2 holds part of the intruction // TODO CHECK R bit for valid fmt combinations // determine source format @@ -715,9 +714,9 @@ module decoder end else begin instruction_o.fu = (instr.rtype.funct7 == 7'b000_0001) ? MULT : ALU; end - instruction_o.rs1[4:0] = instr.rtype.rs1; - instruction_o.rs2[4:0] = instr.rtype.rs2; - instruction_o.rd[4:0] = instr.rtype.rd; + instruction_o.rs1 = instr.rtype.rs1; + instruction_o.rs2 = instr.rtype.rs2; + instruction_o.rd = instr.rtype.rd; unique case ({ instr.rtype.funct7, instr.rtype.funct3 @@ -818,10 +817,10 @@ module decoder // 32bit Reg-Reg Operations // -------------------------- riscv::OpcodeOp32: begin - instruction_o.fu = (instr.rtype.funct7 == 7'b000_0001) ? MULT : ALU; - instruction_o.rs1[4:0] = instr.rtype.rs1; - instruction_o.rs2[4:0] = instr.rtype.rs2; - instruction_o.rd[4:0] = instr.rtype.rd; + instruction_o.fu = (instr.rtype.funct7 == 7'b000_0001) ? 
MULT : ALU; + instruction_o.rs1 = instr.rtype.rs1; + instruction_o.rs2 = instr.rtype.rs2; + instruction_o.rd = instr.rtype.rd; if (CVA6Cfg.IS_XLEN64) begin unique case ({ instr.rtype.funct7, instr.rtype.funct3 @@ -874,8 +873,8 @@ module decoder riscv::OpcodeOpImm: begin instruction_o.fu = ALU; imm_select = IIMM; - instruction_o.rs1[4:0] = instr.itype.rs1; - instruction_o.rd[4:0] = instr.itype.rd; + instruction_o.rs1 = instr.itype.rs1; + instruction_o.rd = instr.itype.rd; unique case (instr.itype.funct3) 3'b000: instruction_o.op = ariane_pkg::ADD; // Add Immediate 3'b010: instruction_o.op = ariane_pkg::SLTS; // Set to one if Lower Than Immediate @@ -939,8 +938,8 @@ module decoder riscv::OpcodeOpImm32: begin instruction_o.fu = ALU; imm_select = IIMM; - instruction_o.rs1[4:0] = instr.itype.rs1; - instruction_o.rd[4:0] = instr.itype.rd; + instruction_o.rs1 = instr.itype.rs1; + instruction_o.rd = instr.itype.rd; if (CVA6Cfg.IS_XLEN64) begin unique case (instr.itype.funct3) 3'b000: instruction_o.op = ariane_pkg::ADDW; // Add Immediate @@ -988,8 +987,8 @@ module decoder riscv::OpcodeStore: begin instruction_o.fu = STORE; imm_select = SIMM; - instruction_o.rs1[4:0] = instr.stype.rs1; - instruction_o.rs2[4:0] = instr.stype.rs2; + instruction_o.rs1 = instr.stype.rs1; + instruction_o.rs2 = instr.stype.rs2; // determine store size unique case (instr.stype.funct3) 3'b000: instruction_o.op = ariane_pkg::SB; @@ -1009,8 +1008,8 @@ module decoder riscv::OpcodeLoad: begin instruction_o.fu = LOAD; imm_select = IIMM; - instruction_o.rs1[4:0] = instr.itype.rs1; - instruction_o.rd[4:0] = instr.itype.rd; + instruction_o.rs1 = instr.itype.rs1; + instruction_o.rd = instr.itype.rd; // determine load size and signed type unique case (instr.itype.funct3) 3'b000: instruction_o.op = ariane_pkg::LB; @@ -1039,8 +1038,8 @@ module decoder if (CVA6Cfg.FpPresent && fs_i != riscv::Off && ((CVA6Cfg.RVH && (!v_i || vfs_i != riscv::Off)) || !CVA6Cfg.RVH)) begin // only generate decoder if FP extensions 
are enabled (static) instruction_o.fu = STORE; imm_select = SIMM; - instruction_o.rs1[4:0] = instr.stype.rs1; - instruction_o.rs2[4:0] = instr.stype.rs2; + instruction_o.rs1 = instr.stype.rs1; + instruction_o.rs2 = instr.stype.rs2; // determine store size unique case (instr.stype.funct3) // Only process instruction if corresponding extension is active (static) @@ -1069,8 +1068,8 @@ module decoder if (CVA6Cfg.FpPresent && fs_i != riscv::Off && ((CVA6Cfg.RVH && (!v_i || vfs_i != riscv::Off)) || !CVA6Cfg.RVH)) begin // only generate decoder if FP extensions are enabled (static) instruction_o.fu = LOAD; imm_select = IIMM; - instruction_o.rs1[4:0] = instr.itype.rs1; - instruction_o.rd[4:0] = instr.itype.rd; + instruction_o.rs1 = instr.itype.rs1; + instruction_o.rd = instr.itype.rd; // determine load size unique case (instr.itype.funct3) // Only process instruction if corresponding extension is active (static) @@ -1100,12 +1099,12 @@ module decoder // ---------------------------------- riscv::OpcodeMadd, riscv::OpcodeMsub, riscv::OpcodeNmsub, riscv::OpcodeNmadd: begin if (CVA6Cfg.FpPresent && fs_i != riscv::Off && ((CVA6Cfg.RVH && (!v_i || vfs_i != riscv::Off)) || !CVA6Cfg.RVH)) begin // only generate decoder if FP extensions are enabled (static) - instruction_o.fu = FPU; - instruction_o.rs1[4:0] = instr.r4type.rs1; - instruction_o.rs2[4:0] = instr.r4type.rs2; - instruction_o.rd[4:0] = instr.r4type.rd; - imm_select = RS3; // rs3 into result field - check_fprm = 1'b1; + instruction_o.fu = FPU; + instruction_o.rs1 = instr.r4type.rs1; + instruction_o.rs2 = instr.r4type.rs2; + instruction_o.rd = instr.r4type.rd; + imm_select = RS3; // rs3 into result field + check_fprm = 1'b1; // select the correct fused operation unique case (instr.r4type.opcode) default: instruction_o.op = ariane_pkg::FMADD; // fmadd.fmt - FP Fused multiply-add @@ -1155,24 +1154,24 @@ module decoder riscv::OpcodeOpFp: begin if (CVA6Cfg.FpPresent && fs_i != riscv::Off && ((CVA6Cfg.RVH && (!v_i || vfs_i != 
riscv::Off)) || !CVA6Cfg.RVH)) begin // only generate decoder if FP extensions are enabled (static) - instruction_o.fu = FPU; - instruction_o.rs1[4:0] = instr.rftype.rs1; - instruction_o.rs2[4:0] = instr.rftype.rs2; - instruction_o.rd[4:0] = instr.rftype.rd; - check_fprm = 1'b1; + instruction_o.fu = FPU; + instruction_o.rs1 = instr.rftype.rs1; + instruction_o.rs2 = instr.rftype.rs2; + instruction_o.rd = instr.rftype.rd; + check_fprm = 1'b1; // decode FP instruction unique case (instr.rftype.funct5) 5'b00000: begin - instruction_o.op = ariane_pkg::FADD; // fadd.fmt - FP Addition - instruction_o.rs1 = '0; // Operand A is set to 0 - instruction_o.rs2[4:0] = instr.rftype.rs1; // Operand B is set to rs1 - imm_select = IIMM; // Operand C is set to rs2 + instruction_o.op = ariane_pkg::FADD; // fadd.fmt - FP Addition + instruction_o.rs1 = '0; // Operand A is set to 0 + instruction_o.rs2 = instr.rftype.rs1; // Operand B is set to rs1 + imm_select = IIMM; // Operand C is set to rs2 end 5'b00001: begin - instruction_o.op = ariane_pkg::FSUB; // fsub.fmt - FP Subtraction - instruction_o.rs1 = '0; // Operand A is set to 0 - instruction_o.rs2[4:0] = instr.rftype.rs1; // Operand B is set to rs1 - imm_select = IIMM; // Operand C is set to rs2 + instruction_o.op = ariane_pkg::FSUB; // fsub.fmt - FP Subtraction + instruction_o.rs1 = '0; // Operand A is set to 0 + instruction_o.rs2 = instr.rftype.rs1; // Operand B is set to rs1 + imm_select = IIMM; // Operand C is set to rs2 end 5'b00010: instruction_o.op = ariane_pkg::FMUL; // fmul.fmt - FP Multiplication 5'b00011: instruction_o.op = ariane_pkg::FDIV; // fdiv.fmt - FP Division @@ -1203,7 +1202,7 @@ module decoder end 5'b01000: begin instruction_o.op = ariane_pkg::FCVT_F2F; // fcvt.fmt.fmt - FP to FP Conversion - instruction_o.rs2[4:0] = instr.rvftype.rs1; // tie rs2 to rs1 to be safe (vectors use rs2) + instruction_o.rs2 = instr.rvftype.rs1; // tie rs2 to rs1 to be safe (vectors use rs2) imm_select = IIMM; // rs2 holds part of the 
intruction if (|instr.rftype.rs2[24:23]) illegal_instr = 1'b1; // bits [22:20] used, other bits must be 0 @@ -1241,7 +1240,7 @@ module decoder illegal_instr = 1'b1; // bits [21:20] used, other bits must be 0 end 5'b11100: begin - instruction_o.rs2[4:0] = instr.rftype.rs1; // set rs2 = rs1 so we can map FMV to SGNJ in the unit + instruction_o.rs2 = instr.rftype.rs1; // set rs2 = rs1 so we can map FMV to SGNJ in the unit check_fprm = 1'b0; // instruction encoded in rm, do the check here if (instr.rftype.rm == 3'b000 || (CVA6Cfg.XF16ALT && instr.rftype.rm == 3'b100)) // FP16ALT has separate encoding instruction_o.op = ariane_pkg::FMV_F2X; // fmv.ifmt.fmt - FPR to GPR Move @@ -1253,7 +1252,7 @@ module decoder end 5'b11110: begin instruction_o.op = ariane_pkg::FMV_X2F; // fmv.fmt.ifmt - GPR to FPR Move - instruction_o.rs2[4:0] = instr.rftype.rs1; // set rs2 = rs1 so we can map FMV to SGNJ in the unit + instruction_o.rs2 = instr.rftype.rs1; // set rs2 = rs1 so we can map FMV to SGNJ in the unit check_fprm = 1'b0; // instruction encoded in rm, do the check here if (!(instr.rftype.rm == 3'b000 || (CVA6Cfg.XF16ALT && instr.rftype.rm == 3'b100))) illegal_instr = 1'b1; @@ -1304,10 +1303,10 @@ module decoder // ---------------------------------- riscv::OpcodeAmo: begin // we are going to use the load unit for AMOs - instruction_o.fu = STORE; - instruction_o.rs1[4:0] = instr.atype.rs1; - instruction_o.rs2[4:0] = instr.atype.rs2; - instruction_o.rd[4:0] = instr.atype.rd; + instruction_o.fu = STORE; + instruction_o.rs1 = instr.atype.rs1; + instruction_o.rs2 = instr.atype.rs2; + instruction_o.rd = instr.atype.rd; // TODO(zarubaf): Ordering // words if (CVA6Cfg.RVA && instr.stype.funct3 == 3'h2) begin @@ -1350,16 +1349,18 @@ module decoder end else begin illegal_instr = 1'b1; end - tinst = { - instr.atype.funct5, - instr.atype.aq, - instr.atype.rl, - instr.atype.rs2, - 5'b0, - instr.atype.funct3, - instr.atype.rd, - instr.atype.opcode - }; + if (CVA6Cfg.RVH) begin + tinst = { + 
instr.atype.funct5, + instr.atype.aq, + instr.atype.rl, + instr.atype.rs2, + 5'b0, + instr.atype.funct3, + instr.atype.rd, + instr.atype.opcode + }; + end end // -------------------------------- @@ -1368,8 +1369,8 @@ module decoder riscv::OpcodeBranch: begin imm_select = SBIMM; instruction_o.fu = CTRL_FLOW; - instruction_o.rs1[4:0] = instr.stype.rs1; - instruction_o.rs2[4:0] = instr.stype.rs2; + instruction_o.rs1 = instr.stype.rs1; + instruction_o.rs2 = instr.stype.rs2; is_control_flow_instr_o = 1'b1; @@ -1390,9 +1391,9 @@ module decoder riscv::OpcodeJalr: begin instruction_o.fu = CTRL_FLOW; instruction_o.op = ariane_pkg::JALR; - instruction_o.rs1[4:0] = instr.itype.rs1; + instruction_o.rs1 = instr.itype.rs1; imm_select = IIMM; - instruction_o.rd[4:0] = instr.itype.rd; + instruction_o.rd = instr.itype.rd; is_control_flow_instr_o = 1'b1; // invalid jump and link register -> reserved for vector encoding if (instr.itype.funct3 != 3'b0) illegal_instr = 1'b1; @@ -1401,34 +1402,37 @@ module decoder riscv::OpcodeJal: begin instruction_o.fu = CTRL_FLOW; imm_select = JIMM; - instruction_o.rd[4:0] = instr.utype.rd; + instruction_o.rd = instr.utype.rd; is_control_flow_instr_o = 1'b1; end riscv::OpcodeAuipc: begin - instruction_o.fu = ALU; - imm_select = UIMM; - instruction_o.use_pc = 1'b1; - instruction_o.rd[4:0] = instr.utype.rd; + instruction_o.fu = ALU; + imm_select = UIMM; + instruction_o.use_pc = 1'b1; + instruction_o.rd = instr.utype.rd; end riscv::OpcodeLui: begin - imm_select = UIMM; - instruction_o.fu = ALU; - instruction_o.rd[4:0] = instr.utype.rd; + imm_select = UIMM; + instruction_o.fu = ALU; + instruction_o.rd = instr.utype.rd; end default: illegal_instr = 1'b1; endcase end if (CVA6Cfg.CvxifEn) begin - if (is_illegal_i || illegal_instr) begin - instruction_o.fu = CVXIF; - instruction_o.rs1[4:0] = instr.r4type.rs1; - instruction_o.rs2[4:0] = instr.r4type.rs2; - instruction_o.rd[4:0] = instr.r4type.rd; - instruction_o.op = ariane_pkg::OFFLOAD; - imm_select = RS3; + 
if (~ex_i.valid && (is_illegal_i || illegal_instr)) begin + instruction_o.fu = CVXIF; + instruction_o.rs1 = instr.r4type.rs1; + instruction_o.rs2 = instr.r4type.rs2; + instruction_o.rd = instr.r4type.rd; + instruction_o.op = ariane_pkg::OFFLOAD; + imm_select = instr.rtype.opcode == riscv::OpcodeMadd || + instr.rtype.opcode == riscv::OpcodeMsub || + instr.rtype.opcode == riscv::OpcodeNmadd || + instr.rtype.opcode == riscv::OpcodeNmsub ? RS3 : MUX_RD_RS3; end end @@ -1474,9 +1478,8 @@ module decoder instruction_i[30:21], 1'b0 }; - imm_bi_type = {{CVA6Cfg.XLEN - 5{instruction_i[24]}}, instruction_i[24:20]}; - // NOIMM, IIMM, SIMM, BIMM, UIMM, JIMM, RS3 + // NOIMM, IIMM, SIMM, SBIMM, UIMM, JIMM, RS3 // select immediate case (imm_select) IIMM: begin @@ -1504,6 +1507,11 @@ module decoder instruction_o.result = {{CVA6Cfg.XLEN - 5{1'b0}}, instr.r4type.rs3}; instruction_o.use_imm = 1'b0; end + MUX_RD_RS3: begin + // result holds address of operand rs3 which is in rd field + instruction_o.result = {{CVA6Cfg.XLEN - 5{1'b0}}, instr.rtype.rd}; + instruction_o.use_imm = 1'b0; + end default: begin instruction_o.result = {CVA6Cfg.XLEN{1'b0}}; instruction_o.use_imm = 1'b0; diff --git a/core/ex_stage.sv b/core/ex_stage.sv index d44aa5ea05..115c17f1c5 100644 --- a/core/ex_stage.sv +++ b/core/ex_stage.sv @@ -28,7 +28,8 @@ module ex_stage parameter type icache_arsp_t = logic, parameter type icache_dreq_t = logic, parameter type icache_drsp_t = logic, - parameter type lsu_ctrl_t = logic + parameter type lsu_ctrl_t = logic, + parameter type x_result_t = logic ) ( // Subsystem Clock - SUBSYSTEM input logic clk_i, @@ -136,7 +137,7 @@ module ex_stage input logic [CVA6Cfg.NrIssuePorts-1:0] x_valid_i, // CVXIF is ready - ISSUE_STAGE output logic x_ready_o, - // undecoded instruction - ISSUE_STAGE + // CVXIF undecoded instruction input logic [31:0] x_off_instr_i, // CVXIF transaction ID - ISSUE_STAGE output logic [CVA6Cfg.TRANS_ID_BITS-1:0] x_trans_id_o, @@ -148,10 +149,14 @@ module ex_stage 
output logic x_valid_o, // CVXIF write enable - ISSUE_STAGE output logic x_we_o, - // CVXIF request - SUBSYSTEM - output cvxif_pkg::cvxif_req_t cvxif_req_o, - // CVXIF response - SUBSYSTEM - input cvxif_pkg::cvxif_resp_t cvxif_resp_i, + // CVXIF destination register - ISSUE_STAGE + output logic [4:0] x_rd_o, + // CVXIF Result interface - SUBSYSTEM + input logic x_result_valid_i, + input x_result_t x_result_i, + output logic x_result_ready_o, + // CVXIF Issue transaction rejected -> illegal instruction - ISSUE_STAGE + input logic x_transaction_rejected_i, // accelerate port result is valid - ACC_DISPATCHER input logic acc_valid_i, // Enable virtual memory translation - CSR_REGFILE @@ -217,9 +222,9 @@ module ex_stage // To count the data TLB misses - PERF_COUNTERS output logic dtlb_miss_o, // Report the PMP configuration - CSR_REGFILE - input riscv::pmpcfg_t [CVA6Cfg.NrPMPEntries:0] pmpcfg_i, + input riscv::pmpcfg_t [CVA6Cfg.NrPMPEntries-1:0] pmpcfg_i, // Report the PMP addresses - CSR_REGFILE - input logic [CVA6Cfg.NrPMPEntries:0][CVA6Cfg.PLEN-3:0] pmpaddr_i, + input logic [CVA6Cfg.NrPMPEntries-1:0][CVA6Cfg.PLEN-3:0] pmpaddr_i, // Information dedicated to RVFI - RVFI output lsu_ctrl_t rvfi_lsu_ctrl_o, // Information dedicated to RVFI - RVFI @@ -269,13 +274,19 @@ module ex_stage assign one_cycle_select = alu_valid_i | branch_valid_i | csr_valid_i; fu_data_t one_cycle_data; + logic [CVA6Cfg.VLEN-1:0] rs1_forwarding; + logic [CVA6Cfg.VLEN-1:0] rs2_forwarding; always_comb begin // data silence operation one_cycle_data = one_cycle_select[0] ? fu_data_i[0] : '0; + rs1_forwarding = rs1_forwarding_i[0]; + rs2_forwarding = rs2_forwarding_i[0]; if (CVA6Cfg.SuperscalarEn) begin if (one_cycle_select[1]) begin one_cycle_data = fu_data_i[1]; + rs1_forwarding = rs1_forwarding_i[1]; + rs2_forwarding = rs2_forwarding_i[1]; end end end @@ -283,6 +294,7 @@ module ex_stage // 1. 
ALU (combinatorial) alu #( .CVA6Cfg (CVA6Cfg), + .HasBranch(1'b1), .fu_data_t(fu_data_t) ) alu_i ( .clk_i, @@ -449,6 +461,7 @@ module ex_stage alu #( .CVA6Cfg (CVA6Cfg), + .HasBranch(1'b0), .fu_data_t(fu_data_t) ) alu2_i ( .clk_i, @@ -592,29 +605,31 @@ module ex_stage cvxif_fu #( .CVA6Cfg(CVA6Cfg), .exception_t(exception_t), - .fu_data_t(fu_data_t) + .x_result_t(x_result_t) ) cvxif_fu_i ( .clk_i, .rst_ni, - .fu_data_i (cvxif_data), - .priv_lvl_i(ld_st_priv_lvl_i), - .x_valid_i (|x_valid_i), - .x_ready_o, + .x_valid_i(|x_valid_i), + .x_trans_id_i(cvxif_data.trans_id), + .x_illegal_i(x_transaction_rejected_i), .x_off_instr_i, + .x_ready_o, .x_trans_id_o, .x_exception_o, .x_result_o, .x_valid_o, .x_we_o, - .cvxif_req_o, - .cvxif_resp_i + .x_rd_o, + .result_valid_i(x_result_valid_i), + .result_i(x_result_i), + .result_ready_o(x_result_ready_o) ); end else begin : gen_no_cvxif - assign cvxif_req_o = '0; - assign x_trans_id_o = '0; - assign x_exception_o = '0; - assign x_result_o = '0; - assign x_valid_o = '0; + assign x_result_ready_o = '0; + assign x_trans_id_o = '0; + assign x_exception_o = '0; + assign x_result_o = '0; + assign x_valid_o = '0; end if (CVA6Cfg.RVS) begin @@ -664,10 +679,10 @@ module ex_stage gpaddr_to_be_flushed <= '0; // if the current instruction in EX_STAGE is a sfence.vma, in the next cycle no writes will happen end else if ((~(current_instruction_is_sfence_vma || current_instruction_is_hfence_vvma || current_instruction_is_hfence_gvma)) && (~((fu_data_i[0].operation == SFENCE_VMA || fu_data_i[0].operation == HFENCE_VVMA || fu_data_i[0].operation == HFENCE_GVMA ) && |csr_valid_i))) begin - vaddr_to_be_flushed <= rs1_forwarding_i; - gpaddr_to_be_flushed <= {2'b00, rs1_forwarding_i[CVA6Cfg.GPLEN-1:2]}; - asid_to_be_flushed <= rs2_forwarding_i[CVA6Cfg.ASID_WIDTH-1:0]; - vmid_to_be_flushed <= rs2_forwarding_i[CVA6Cfg.VMID_WIDTH-1:0]; + vaddr_to_be_flushed <= rs1_forwarding; + gpaddr_to_be_flushed <= {2'b00, rs1_forwarding[CVA6Cfg.GPLEN-1:2]}; + 
asid_to_be_flushed <= rs2_forwarding[CVA6Cfg.ASID_WIDTH-1:0]; + vmid_to_be_flushed <= rs2_forwarding[CVA6Cfg.VMID_WIDTH-1:0]; end end end else begin @@ -680,8 +695,8 @@ module ex_stage vaddr_to_be_flushed <= '0; // if the current instruction in EX_STAGE is a sfence.vma, in the next cycle no writes will happen end else if ((~current_instruction_is_sfence_vma) && (~((fu_data_i[0].operation == SFENCE_VMA) && |csr_valid_i))) begin - vaddr_to_be_flushed <= rs1_forwarding_i; - asid_to_be_flushed <= rs2_forwarding_i[CVA6Cfg.ASID_WIDTH-1:0]; + vaddr_to_be_flushed <= rs1_forwarding; + asid_to_be_flushed <= rs2_forwarding[CVA6Cfg.ASID_WIDTH-1:0]; end end end diff --git a/core/frontend/bht.sv b/core/frontend/bht.sv index 1057c2cf44..e95c536770 100644 --- a/core/frontend/bht.sv +++ b/core/frontend/bht.sv @@ -1,5 +1,6 @@ // Copyright 2018 - 2019 ETH Zurich and University of Bologna. -// Copyright 2023 - Thales for additionnal conribution. +// Copyright 2023 - Thales for additional contribution. +// Copyright 2024 - PlanV Technologies for additional contribution. // Copyright and related rights are licensed under the Solderpad Hardware // License, Version 2.0 (the "License"); you may not use this file except in // compliance with the License. You may obtain a copy of the License at @@ -15,6 +16,8 @@ // Date: 09.06.2018 // FPGA optimization: Sebastien Jacq, Thales // Date: 2023-01-30 +// FPGA optimization for Altera: Angela Gonzalez, PlanV Technologies +// Date: 2024-10-16 // branch history table - 2 bit saturation counter @@ -47,8 +50,6 @@ module bht #( localparam ROW_INDEX_BITS = CVA6Cfg.RVC == 1'b1 ? 
$clog2(CVA6Cfg.INSTR_PER_FETCH) : 1; // number of bits we should use for prediction localparam PREDICTION_BITS = $clog2(NR_ROWS) + OFFSET + ROW_ADDR_BITS; - // we are not interested in all bits of the address - unread i_unread (.d_i(|vpc_i)); struct packed { logic valid; @@ -58,7 +59,7 @@ module bht #( bht_q[NR_ROWS-1:0][CVA6Cfg.INSTR_PER_FETCH-1:0]; logic [$clog2(NR_ROWS)-1:0] index, update_pc; - logic [ROW_INDEX_BITS-1:0] update_row_index; + logic [ROW_INDEX_BITS-1:0] update_row_index, update_row_index_q, check_update_row_index; assign index = vpc_i[PREDICTION_BITS-1:ROW_ADDR_BITS+OFFSET]; assign update_pc = bht_update_i.pc[PREDICTION_BITS-1:ROW_ADDR_BITS+OFFSET]; @@ -127,17 +128,23 @@ module bht #( // number of bits par word in the bram localparam BRAM_WORD_BITS = $bits(ariane_pkg::bht_t); - logic [ ROW_INDEX_BITS-1:0] row_index; - logic [ CVA6Cfg.INSTR_PER_FETCH-1:0] bht_ram_we; - logic [CVA6Cfg.INSTR_PER_FETCH*$clog2(NR_ROWS)-1:0] bht_ram_read_address_0; - logic [CVA6Cfg.INSTR_PER_FETCH*$clog2(NR_ROWS)-1:0] bht_ram_read_address_1; - logic [CVA6Cfg.INSTR_PER_FETCH*$clog2(NR_ROWS)-1:0] bht_ram_write_address; - logic [ CVA6Cfg.INSTR_PER_FETCH*BRAM_WORD_BITS-1:0] bht_ram_wdata; - logic [ CVA6Cfg.INSTR_PER_FETCH*BRAM_WORD_BITS-1:0] bht_ram_rdata_0; - logic [ CVA6Cfg.INSTR_PER_FETCH*BRAM_WORD_BITS-1:0] bht_ram_rdata_1; - - ariane_pkg::bht_t [ CVA6Cfg.INSTR_PER_FETCH-1:0] bht; - ariane_pkg::bht_t [ CVA6Cfg.INSTR_PER_FETCH-1:0] bht_updated; + logic [ROW_INDEX_BITS-1:0] row_index, row_index_q, check_row_index; + logic [CVA6Cfg.INSTR_PER_FETCH-1:0] bht_ram_we, bht_ram_we_q; + logic [CVA6Cfg.INSTR_PER_FETCH*$clog2(NR_ROWS)-1:0] bht_ram_read_address_0; + logic [CVA6Cfg.INSTR_PER_FETCH*$clog2(NR_ROWS)-1:0] bht_ram_read_address_1; + logic [CVA6Cfg.INSTR_PER_FETCH*$clog2(NR_ROWS)-1:0] + bht_ram_write_address, bht_ram_write_address_q; + logic [CVA6Cfg.INSTR_PER_FETCH*BRAM_WORD_BITS-1:0] bht_ram_wdata, bht_ram_wdata_q; + logic [CVA6Cfg.INSTR_PER_FETCH*BRAM_WORD_BITS-1:0] 
bht_ram_rdata_0; + logic [CVA6Cfg.INSTR_PER_FETCH*BRAM_WORD_BITS-1:0] bht_ram_rdata_1; + + ariane_pkg::bht_t [CVA6Cfg.INSTR_PER_FETCH-1:0] bht; + ariane_pkg::bht_t [CVA6Cfg.INSTR_PER_FETCH-1:0] bht_updated; + + logic [CVA6Cfg.INSTR_PER_FETCH-1:0][1:0] bht_updated_valid; + logic [CVA6Cfg.INSTR_PER_FETCH-1:0][1:0][CVA6Cfg.VLEN-1:0] bht_updated_pc; + logic bht_update_taken, check_bht_update_taken; + logic [CVA6Cfg.VLEN-1:0] vpc_q; if (CVA6Cfg.RVC) begin : gen_row_index assign row_index = vpc_i[ROW_ADDR_BITS+OFFSET-1:OFFSET]; @@ -157,66 +164,150 @@ module bht #( bht_updated = '0; bht = '0; - for (int i = 0; i < CVA6Cfg.INSTR_PER_FETCH; i++) begin - if (row_index == i) begin - bht_ram_read_address_0[i*$clog2(NR_ROWS)+:$clog2(NR_ROWS)] = index; - bht_prediction_o[i].valid = bht_ram_rdata_0[i*BRAM_WORD_BITS+2]; - bht_prediction_o[i].taken = bht_ram_rdata_0[i*BRAM_WORD_BITS+1]; - end - end - + //Write to RAM if (bht_update_i.valid && !debug_mode_i) begin for (int i = 0; i < CVA6Cfg.INSTR_PER_FETCH; i++) begin if (update_row_index == i) begin - bht_ram_read_address_1[i*$clog2(NR_ROWS)+:$clog2(NR_ROWS)] = update_pc; - bht[i].saturation_counter = bht_ram_rdata_1[i*BRAM_WORD_BITS+:2]; - - if (bht[i].saturation_counter == 2'b11) begin - // we can safely decrease it - if (!bht_update_i.taken) - bht_updated[i].saturation_counter = bht[i].saturation_counter - 1; - else bht_updated[i].saturation_counter = 2'b11; - // then check if it saturated in the negative regime e.g.: branch not taken - end else if (bht[i].saturation_counter == 2'b00) begin - // we can safely increase it - if (bht_update_i.taken) - bht_updated[i].saturation_counter = bht[i].saturation_counter + 1; - else bht_updated[i].saturation_counter = 2'b00; - end else begin // otherwise we are not in any boundaries and can decrease or increase it - if (bht_update_i.taken) - bht_updated[i].saturation_counter = bht[i].saturation_counter + 1; - else bht_updated[i].saturation_counter = bht[i].saturation_counter - 1; - end - 
bht_updated[i].valid = 1'b1; bht_ram_we[i] = 1'b1; bht_ram_write_address[i*$clog2(NR_ROWS)+:$clog2(NR_ROWS)] = update_pc; - //bht_ram_wdata[(i+1)*BRAM_WORD_BITS-1] = 1'b1; //valid - bht_ram_wdata[i*BRAM_WORD_BITS+:BRAM_WORD_BITS] = { - bht_updated[i].valid, bht_updated[i].saturation_counter - }; + end + end + end + + for (int i = 0; i < CVA6Cfg.INSTR_PER_FETCH; i++) begin + //When synchronous RAM is used, addresses are needed as soon as available + if (CVA6Cfg.FpgaAlteraEn) + bht_ram_read_address_0[i*$clog2(NR_ROWS)+:$clog2(NR_ROWS)] = index; + if (CVA6Cfg.FpgaAlteraEn) + bht_ram_read_address_1[i*$clog2(NR_ROWS)+:$clog2(NR_ROWS)] = update_pc; + + if (check_update_row_index == i) begin + //When asynchronous RAM is used, the address can be updated on the cycle when data is read + if (!CVA6Cfg.FpgaAlteraEn) + bht_ram_read_address_1[i*$clog2(NR_ROWS)+:$clog2(NR_ROWS)] = update_pc; + bht[i].saturation_counter = bht_ram_rdata_1[i*BRAM_WORD_BITS+:2]; + + if (bht[i].saturation_counter == 2'b11) begin + // we can safely decrease it + if (!check_bht_update_taken) + bht_updated[i].saturation_counter = bht[i].saturation_counter - 1; + else bht_updated[i].saturation_counter = 2'b11; + // then check if it saturated in the negative regime e.g.: branch not taken + end else if (bht[i].saturation_counter == 2'b00) begin + // we can safely increase it + if (check_bht_update_taken) + bht_updated[i].saturation_counter = bht[i].saturation_counter + 1; + else bht_updated[i].saturation_counter = 2'b00; + end else begin // otherwise we are not in any boundaries and can decrease or increase it + if (check_bht_update_taken) + bht_updated[i].saturation_counter = bht[i].saturation_counter + 1; + else bht_updated[i].saturation_counter = bht[i].saturation_counter - 1; + end + + //The data written in the RAM will have the valid bit from current input (async RAM) or the one from one clock cycle before (sync RAM) + bht_ram_wdata[i*BRAM_WORD_BITS+:BRAM_WORD_BITS] = CVA6Cfg.FpgaAlteraEn ? 
{bht_updated_valid[i][0], bht_updated[i].saturation_counter} : + {bht_updated[i].valid, bht_updated[i].saturation_counter}; + end + + + if (!rst_ni) begin + //initialize output + bht_prediction_o[i] = '0; + end else begin + //When asynchronous RAM is used, addresses can be calculated on the same cycle as data is read + if (!CVA6Cfg.FpgaAlteraEn) + bht_ram_read_address_0[i*$clog2(NR_ROWS)+:$clog2(NR_ROWS)] = index; + //When synchronous RAM is used and data is read right after writing, we need some buffering + // This is one cycle of buffering + if (CVA6Cfg.FpgaAlteraEn && bht_updated_valid[i][0] && vpc_q == bht_updated_pc[i][0]) begin + bht_prediction_o[i].valid = bht_ram_wdata[i*BRAM_WORD_BITS+2]; + bht_prediction_o[i].taken = bht_ram_wdata[i*BRAM_WORD_BITS+1]; + //This is two cycles of buffering + end else if (CVA6Cfg.FpgaAlteraEn && bht_updated_valid[i][1] && vpc_q == bht_updated_pc[i][1]) begin + bht_prediction_o[i].valid = bht_ram_wdata_q[i*BRAM_WORD_BITS+2]; + bht_prediction_o[i].taken = bht_ram_wdata_q[i*BRAM_WORD_BITS+1]; + //In any other case we can safely read from the RAM as data is available + end else begin + bht_prediction_o[i].valid = bht_ram_rdata_0[i*BRAM_WORD_BITS+2]; + bht_prediction_o[i].taken = bht_ram_rdata_0[i*BRAM_WORD_BITS+1]; end end end end for (genvar i = 0; i < CVA6Cfg.INSTR_PER_FETCH; i++) begin : gen_bht_ram - AsyncThreePortRam #( - .ADDR_WIDTH($clog2(NR_ROWS)), - .DATA_DEPTH(NR_ROWS), - .DATA_WIDTH(BRAM_WORD_BITS) - ) i_bht_ram ( - .Clk_CI (clk_i), - .WrEn_SI (bht_ram_we[i]), - .WrAddr_DI (bht_ram_write_address[i*$clog2(NR_ROWS)+:$clog2(NR_ROWS)]), - .WrData_DI (bht_ram_wdata[i*BRAM_WORD_BITS+:BRAM_WORD_BITS]), - .RdAddr_DI_0(bht_ram_read_address_0[i*$clog2(NR_ROWS)+:$clog2(NR_ROWS)]), - .RdAddr_DI_1(bht_ram_read_address_1[i*$clog2(NR_ROWS)+:$clog2(NR_ROWS)]), - .RdData_DO_0(bht_ram_rdata_0[i*BRAM_WORD_BITS+:BRAM_WORD_BITS]), - .RdData_DO_1(bht_ram_rdata_1[i*BRAM_WORD_BITS+:BRAM_WORD_BITS]) - ); + if (CVA6Cfg.FpgaAlteraEn) begin + 
SyncThreePortRam #( + .ADDR_WIDTH($clog2(NR_ROWS)), + .DATA_DEPTH(NR_ROWS), + .DATA_WIDTH(BRAM_WORD_BITS) + ) i_bht_ram ( + .Clk_CI (clk_i), + .WrEn_SI (bht_ram_we_q[i]), + .WrAddr_DI (bht_ram_write_address_q[i*$clog2(NR_ROWS)+:$clog2(NR_ROWS)]), + .WrData_DI (bht_ram_wdata[i*BRAM_WORD_BITS+:BRAM_WORD_BITS]), + .RdAddr_DI_0(bht_ram_read_address_0[i*$clog2(NR_ROWS)+:$clog2(NR_ROWS)]), + .RdAddr_DI_1(bht_ram_read_address_1[i*$clog2(NR_ROWS)+:$clog2(NR_ROWS)]), + .RdData_DO_0(bht_ram_rdata_0[i*BRAM_WORD_BITS+:BRAM_WORD_BITS]), + .RdData_DO_1(bht_ram_rdata_1[i*BRAM_WORD_BITS+:BRAM_WORD_BITS]) + ); + + end else begin + AsyncThreePortRam #( + .ADDR_WIDTH($clog2(NR_ROWS)), + .DATA_DEPTH(NR_ROWS), + .DATA_WIDTH(BRAM_WORD_BITS) + ) i_bht_ram ( + .Clk_CI (clk_i), + .WrEn_SI (bht_ram_we[i]), + .WrAddr_DI (bht_ram_write_address[i*$clog2(NR_ROWS)+:$clog2(NR_ROWS)]), + .WrData_DI (bht_ram_wdata[i*BRAM_WORD_BITS+:BRAM_WORD_BITS]), + .RdAddr_DI_0(bht_ram_read_address_0[i*$clog2(NR_ROWS)+:$clog2(NR_ROWS)]), + .RdAddr_DI_1(bht_ram_read_address_1[i*$clog2(NR_ROWS)+:$clog2(NR_ROWS)]), + .RdData_DO_0(bht_ram_rdata_0[i*BRAM_WORD_BITS+:BRAM_WORD_BITS]), + .RdData_DO_1(bht_ram_rdata_1[i*BRAM_WORD_BITS+:BRAM_WORD_BITS]) + ); + end end + // Extra buffering signals needed when synchronous RAM is used + + always_ff @(posedge clk_i or negedge rst_ni) begin + if (CVA6Cfg.FpgaAlteraEn) begin + if (!rst_ni) begin + bht_updated_valid <= '0; + bht_update_taken <= '0; + bht_ram_wdata_q <= '0; + row_index_q <= '0; + bht_ram_we_q <= '0; + bht_ram_write_address_q <= '0; + update_row_index_q <= '0; + end else begin + for (int i = 0; i < CVA6Cfg.INSTR_PER_FETCH; i++) begin + bht_updated_valid[i][1] <= bht_updated_valid[i][0]; + bht_updated_valid[i][0] <= bht_updated[i].valid; + bht_updated_pc[i][1] <= bht_updated_pc[i][0]; + bht_updated_pc[i][0] <= bht_update_i.pc; + + end + vpc_q <= vpc_i; + bht_update_taken <= bht_update_i.taken; + bht_ram_wdata_q <= bht_ram_wdata; + bht_ram_we_q <= bht_ram_we; + 
bht_ram_write_address_q <= bht_ram_write_address; + update_row_index_q <= update_row_index; + + row_index_q <= row_index; + end + end + end + + // Assignment of indexes checked to generate data written in the RAM. When synchronous RAM is used these signals need to be delayed + assign check_update_row_index = CVA6Cfg.FpgaAlteraEn ? update_row_index_q : update_row_index; + assign check_bht_update_taken = CVA6Cfg.FpgaAlteraEn ? bht_update_taken : bht_update_i.taken; + assign check_row_index = CVA6Cfg.FpgaAlteraEn ? row_index_q : row_index; + end endmodule diff --git a/core/frontend/btb.sv b/core/frontend/btb.sv index 24f9e59e98..139f6558c7 100644 --- a/core/frontend/btb.sv +++ b/core/frontend/btb.sv @@ -90,7 +90,7 @@ module btb #( for (genvar i = 0; i < CVA6Cfg.INSTR_PER_FETCH; i++) begin : gen_btb_output assign btb_ram_csel_prediction[i] = 1'b1; assign btb_ram_we_prediction[i] = 1'b0; - assign btb_ram_wdata_prediction = '0; + assign btb_ram_wdata_prediction[i*BRAM_WORD_BITS+:BRAM_WORD_BITS] = '0; assign btb_ram_addr_prediction[i*$clog2(NR_ROWS)+:$clog2(NR_ROWS)] = index; assign btb_prediction_o[i] = btb_ram_rdata_prediction[i*BRAM_WORD_BITS+:BRAM_WORD_BITS]; end diff --git a/core/frontend/frontend.sv b/core/frontend/frontend.sv index e59da0ee36..3117e27654 100644 --- a/core/frontend/frontend.sv +++ b/core/frontend/frontend.sv @@ -140,6 +140,7 @@ module frontend btb_prediction_t [CVA6Cfg.INSTR_PER_FETCH-1:0] btb_prediction_shifted; ras_t ras_predict; logic [ CVA6Cfg.VLEN-1:0] vpc_btb; + logic [ CVA6Cfg.VLEN-1:0] vpc_bht; // branch-predict update logic is_mispredict; @@ -484,7 +485,9 @@ module frontend //For FPGA, BTB is implemented in read synchronous BRAM //while for ASIC, BTB is implemented in D flip-flop //and can be read at the same cycle. + //Same for BHT assign vpc_btb = (CVA6Cfg.FpgaEn) ? icache_dreq_i.vaddr : icache_vaddr_q; + assign vpc_bht = (CVA6Cfg.FpgaEn && CVA6Cfg.FpgaAlteraEn && icache_dreq_i.valid) ? 
icache_dreq_i.vaddr : icache_vaddr_q; if (CVA6Cfg.BTBEntries == 0) begin assign btb_prediction = '0; @@ -517,7 +520,7 @@ module frontend .rst_ni, .flush_bp_i (flush_bp_i), .debug_mode_i, - .vpc_i (icache_vaddr_q), + .vpc_i (vpc_bht), .bht_update_i (bht_update), .bht_prediction_o(bht_prediction) ); diff --git a/core/frontend/instr_queue.sv b/core/frontend/instr_queue.sv index 4827a9364f..ceac06bbe2 100644 --- a/core/frontend/instr_queue.sv +++ b/core/frontend/instr_queue.sv @@ -103,23 +103,18 @@ module instr_queue logic [CVA6Cfg.LOG2_INSTR_PER_FETCH-1:0] branch_index; // instruction queues - logic [CVA6Cfg.INSTR_PER_FETCH-1:0][$clog2( -ariane_pkg::FETCH_FIFO_DEPTH -)-1:0] instr_queue_usage; instr_data_t [CVA6Cfg.INSTR_PER_FETCH-1:0] instr_data_in, instr_data_out; logic [CVA6Cfg.INSTR_PER_FETCH-1:0] push_instr, push_instr_fifo; - logic [ CVA6Cfg.INSTR_PER_FETCH-1:0] pop_instr; - logic [ CVA6Cfg.INSTR_PER_FETCH-1:0] instr_queue_full; - logic [ CVA6Cfg.INSTR_PER_FETCH-1:0] instr_queue_empty; - logic instr_overflow; + logic [CVA6Cfg.INSTR_PER_FETCH-1:0] pop_instr; + logic [CVA6Cfg.INSTR_PER_FETCH-1:0] instr_queue_full; + logic [CVA6Cfg.INSTR_PER_FETCH-1:0] instr_queue_empty; + logic instr_overflow; // address queue - logic [$clog2(ariane_pkg::FETCH_FIFO_DEPTH)-1:0] address_queue_usage; - logic [ CVA6Cfg.VLEN-1:0] address_out; - logic pop_address; - logic push_address; - logic full_address; - logic empty_address; - logic address_overflow; + logic [ CVA6Cfg.VLEN-1:0] address_out; + logic pop_address; + logic push_address; + logic full_address; + logic address_overflow; // input stream counter logic [CVA6Cfg.LOG2_INSTR_PER_FETCH-1:0] idx_is_d, idx_is_q; @@ -137,7 +132,6 @@ ariane_pkg::FETCH_FIFO_DEPTH logic [CVA6Cfg.INSTR_PER_FETCH*2-2:0] branch_mask_extended; logic [CVA6Cfg.INSTR_PER_FETCH-1:0] branch_mask; - logic branch_empty; logic [CVA6Cfg.INSTR_PER_FETCH-1:0] taken; // shift amount, e.g.: instructions we want to retire logic [CVA6Cfg.LOG2_INSTR_PER_FETCH:0] popcount; 
@@ -167,7 +161,7 @@ ariane_pkg::FETCH_FIFO_DEPTH ) i_lzc_branch_index ( .in_i (taken), // we want to count trailing zeros .cnt_o (branch_index), // first branch on branch_index - .empty_o(branch_empty) + .empty_o() ); @@ -237,7 +231,6 @@ ariane_pkg::FETCH_FIFO_DEPTH end else begin : gen_multiple_instr_per_fetch_without_C assign taken = '0; - assign branch_empty = '0; assign branch_index = '0; assign branch_mask_extended = '0; assign branch_mask = '0; @@ -376,7 +369,7 @@ ariane_pkg::FETCH_FIFO_DEPTH end fetch_entry_o[NID].instruction = instr_data_out[i].instr; fetch_entry_o[NID].ex.valid = instr_data_out[i].ex != ariane_pkg::FE_NONE; - fetch_entry_o[NID].ex.tval = {{64 - riscv::VLEN{1'b0}}, instr_data_out[i].ex_vaddr}; + fetch_entry_o[NID].ex.tval = {{64 - CVA6Cfg.VLEN{1'b0}}, instr_data_out[i].ex_vaddr}; fetch_entry_o[NID].branch_predict.cf = instr_data_out[i].cf; // Cannot output two CF the same cycle. pop_instr[i] = fetch_entry_fire[NID]; @@ -468,8 +461,9 @@ ariane_pkg::FETCH_FIFO_DEPTH // Make sure we don't save any instructions if we couldn't save the address assign push_instr_fifo[i] = push_instr[i] & ~address_overflow; cva6_fifo_v3 #( - .DEPTH (ariane_pkg::FETCH_FIFO_DEPTH), - .dtype (instr_data_t), + .FPGA_ALTERA(CVA6Cfg.FpgaAlteraEn), + .DEPTH(ariane_pkg::FETCH_FIFO_DEPTH), + .dtype(instr_data_t), .FPGA_EN(CVA6Cfg.FpgaEn) ) i_fifo_instr_data ( .clk_i (clk_i), @@ -478,7 +472,7 @@ ariane_pkg::FETCH_FIFO_DEPTH .testmode_i(1'b0), .full_o (instr_queue_full[i]), .empty_o (instr_queue_empty[i]), - .usage_o (instr_queue_usage[i]), + .usage_o (), .data_i (instr_data_in[i]), .push_i (push_instr_fifo[i]), .data_o (instr_data_out[i]), @@ -496,28 +490,26 @@ ariane_pkg::FETCH_FIFO_DEPTH end cva6_fifo_v3 #( - .DEPTH (ariane_pkg::FETCH_FIFO_DEPTH), // TODO(zarubaf): Fork out to separate param - .DATA_WIDTH(CVA6Cfg.VLEN), - .FPGA_EN (CVA6Cfg.FpgaEn) + .FPGA_ALTERA(CVA6Cfg.FpgaAlteraEn), + .DEPTH (ariane_pkg::FETCH_ADDR_FIFO_DEPTH), + .DATA_WIDTH (CVA6Cfg.VLEN), + .FPGA_EN 
(CVA6Cfg.FpgaEn) ) i_fifo_address ( .clk_i (clk_i), .rst_ni (rst_ni), .flush_i (flush_i), .testmode_i(1'b0), .full_o (full_address), - .empty_o (empty_address), - .usage_o (address_queue_usage), + .empty_o (), + .usage_o (), .data_i (predict_address_i), .push_i (push_address & ~full_address), .data_o (address_out), .pop_i (pop_address) ); - unread i_unread_address_fifo (.d_i(|{empty_address, address_queue_usage})); unread i_unread_branch_mask (.d_i(|branch_mask_extended)); - unread i_unread_lzc (.d_i(|{branch_empty})); unread i_unread_fifo_pos (.d_i(|fifo_pos_extended)); // we don't care about the lower signals - unread i_unread_instr_fifo (.d_i(|instr_queue_usage)); if (CVA6Cfg.RVC) begin : gen_pc_q_with_c always_ff @(posedge clk_i or negedge rst_ni) begin diff --git a/core/id_stage.sv b/core/id_stage.sv index b11f758ab3..864c74b329 100644 --- a/core/id_stage.sv +++ b/core/id_stage.sv @@ -21,7 +21,9 @@ module id_stage #( parameter type irq_ctrl_t = logic, parameter type scoreboard_entry_t = logic, parameter type interrupts_t = logic, - parameter interrupts_t INTERRUPTS = '0 + parameter interrupts_t INTERRUPTS = '0, + parameter type x_compressed_req_t = logic, + parameter type x_compressed_resp_t = logic ) ( // Subsystem Clock - SUBSYSTEM input logic clk_i, @@ -76,7 +78,13 @@ module id_stage #( // Trap sret - CSR_REGFILE input logic tsr_i, // Hypervisor user mode - CSR_REGFILE - input logic hu_i + input logic hu_i, + // CVXIF Compressed interface + input logic [CVA6Cfg.XLEN-1:0] hart_id_i, + input logic compressed_ready_i, + input x_compressed_resp_t compressed_resp_i, + output logic compressed_valid_o, + output x_compressed_req_t compressed_req_o ); // ID/ISSUE register stage typedef struct packed { @@ -93,12 +101,17 @@ module id_stage #( logic [CVA6Cfg.NrIssuePorts-1:0] is_illegal; logic [CVA6Cfg.NrIssuePorts-1:0] is_illegal_cmp; + logic [CVA6Cfg.NrIssuePorts-1:0] is_illegal_cvxif; logic [CVA6Cfg.NrIssuePorts-1:0][31:0] instruction; logic 
[CVA6Cfg.NrIssuePorts-1:0][31:0] compressed_instr; + logic [CVA6Cfg.NrIssuePorts-1:0][31:0] instruction_cvxif; logic [CVA6Cfg.NrIssuePorts-1:0] is_compressed; logic [CVA6Cfg.NrIssuePorts-1:0] is_compressed_cmp; + logic [CVA6Cfg.NrIssuePorts-1:0] is_compressed_cvxif; + logic [CVA6Cfg.NrIssuePorts-1:0] is_macro_instr_i; logic stall_instr_fetch; + logic stall_macro_deco; logic is_last_macro_instr_o; logic is_double_rd_macro_instr_o; @@ -126,25 +139,64 @@ module id_stage #( .is_macro_instr_i (is_macro_instr_i[0]), .clk_i (clk_i), .rst_ni (rst_ni), - .instr_o (instruction[0]), + .instr_o (instruction_cvxif[0]), .illegal_instr_i (is_illegal[0]), .is_compressed_i (is_compressed[0]), .issue_ack_i (issue_instr_ack_i[0]), - .illegal_instr_o (is_illegal_cmp[0]), - .is_compressed_o (is_compressed_cmp[0]), - .fetch_stall_o (stall_instr_fetch), + .illegal_instr_o (is_illegal_cvxif[0]), + .is_compressed_o (is_compressed_cvxif[0]), + .fetch_stall_o (stall_macro_deco), .is_last_macro_instr_o (is_last_macro_instr_o), .is_double_rd_macro_instr_o(is_double_rd_macro_instr_o) ); if (CVA6Cfg.SuperscalarEn) begin - assign instruction[CVA6Cfg.NrIssuePorts-1] = '0; - assign is_illegal_cmp[CVA6Cfg.NrIssuePorts-1] = '0; - assign is_compressed_cmp[CVA6Cfg.NrIssuePorts-1] = '0; + assign instruction_cvxif[CVA6Cfg.NrIssuePorts-1] = '0; + assign is_illegal_cvxif[CVA6Cfg.NrIssuePorts-1] = '0; + assign is_compressed_cvxif[CVA6Cfg.NrIssuePorts-1] = '0; end + cvxif_compressed_if_driver #( + .CVA6Cfg(CVA6Cfg), + .x_compressed_req_t(x_compressed_req_t), + .x_compressed_resp_t(x_compressed_resp_t) + ) i_cvxif_compressed_if_driver_i ( + .clk_i (clk_i), + .rst_ni (rst_ni), + .hart_id_i (hart_id_i), + .is_compressed_i (is_compressed_cvxif), + .is_illegal_i (is_illegal_cvxif), + .instruction_i (instruction_cvxif), + .is_compressed_o (is_compressed_cmp), + .is_illegal_o (is_illegal_cmp), + .instruction_o (instruction), + .stall_i (stall_macro_deco), + .stall_o (stall_instr_fetch), + 
.compressed_ready_i(compressed_ready_i), + .compressed_resp_i (compressed_resp_i), + .compressed_valid_o(compressed_valid_o), + .compressed_req_o (compressed_req_o) + ); end else begin - assign instruction = compressed_instr; - assign is_illegal_cmp = is_illegal; - assign is_compressed_cmp = is_compressed; + cvxif_compressed_if_driver #( + .CVA6Cfg(CVA6Cfg), + .x_compressed_req_t(x_compressed_req_t), + .x_compressed_resp_t(x_compressed_resp_t) + ) i_cvxif_compressed_if_driver_i ( + .clk_i (clk_i), + .rst_ni (rst_ni), + .hart_id_i (hart_id_i), + .is_compressed_i (is_compressed), + .is_illegal_i (is_illegal), + .instruction_i (compressed_instr), + .is_compressed_o (is_compressed_cmp), + .is_illegal_o (is_illegal_cmp), + .instruction_o (instruction), + .stall_i (1'b0), + .stall_o (stall_instr_fetch), + .compressed_ready_i(compressed_ready_i), + .compressed_resp_i (compressed_resp_i), + .compressed_valid_o(compressed_valid_o), + .compressed_req_o (compressed_req_o) + ); assign is_last_macro_instr_o = '0; assign is_double_rd_macro_instr_o = '0; end @@ -157,6 +209,11 @@ module id_stage #( assign is_macro_instr_i = '0; assign is_last_macro_instr_o = '0; assign is_double_rd_macro_instr_o = '0; + if (CVA6Cfg.CvxifEn) begin + assign compressed_valid_o = '0; + assign compressed_req_o.instr = '0; + assign compressed_req_o.hartid = hart_id_i; + end // TODO Add else to map x_compressed_if outputs to '0 ? 
end assign rvfi_is_compressed_o = is_compressed_cmp; diff --git a/core/include/ariane_pkg.sv b/core/include/ariane_pkg.sv index 1e2666f239..e729929d3e 100644 --- a/core/include/ariane_pkg.sv +++ b/core/include/ariane_pkg.sv @@ -167,6 +167,7 @@ package ariane_pkg; // leave as is (fails with >8 entries and wider fetch width) localparam int unsigned FETCH_FIFO_DEPTH = 4; + localparam int unsigned FETCH_ADDR_FIFO_DEPTH = 2; typedef enum logic [2:0] { NoCF, // No control flow prediction diff --git a/core/include/build_config_pkg.sv b/core/include/build_config_pkg.sv index fdfd4c0a6a..5d4808bb1c 100644 --- a/core/include/build_config_pkg.sv +++ b/core/include/build_config_pkg.sv @@ -32,7 +32,7 @@ package build_config_pkg; config_pkg::cva6_cfg_t cfg; cfg.XLEN = CVA6Cfg.XLEN; - cfg.VLEN = (CVA6Cfg.XLEN == 32) ? 32 : 64; + cfg.VLEN = CVA6Cfg.VLEN; cfg.PLEN = (CVA6Cfg.XLEN == 32) ? 34 : 56; cfg.GPLEN = (CVA6Cfg.XLEN == 32) ? 34 : 41; cfg.IS_XLEN32 = IS_XLEN32; @@ -42,6 +42,7 @@ package build_config_pkg; cfg.VMID_WIDTH = (CVA6Cfg.XLEN == 64) ? 14 : 1; cfg.FpgaEn = CVA6Cfg.FpgaEn; + cfg.FpgaAlteraEn = CVA6Cfg.FpgaAlteraEn; cfg.TechnoCut = CVA6Cfg.TechnoCut; cfg.SuperscalarEn = CVA6Cfg.SuperscalarEn; @@ -84,7 +85,9 @@ package build_config_pkg; cfg.XF16Vec = bit'(XF16Vec); cfg.XF16ALTVec = bit'(XF16ALTVec); cfg.XF8Vec = bit'(XF8Vec); + // Can take 2 or 3 in single issue. 4 or 6 in dual issue. cfg.NrRgprPorts = unsigned'(CVA6Cfg.SuperscalarEn ? 4 : 2); + // cfg.NrRgprPorts = unsigned'(CVA6Cfg.SuperscalarEn ? 6 : 3); cfg.NrWbPorts = unsigned'(NrWbPorts); cfg.EnableAccelerator = bit'(EnableAccelerator); cfg.PerfCounterEn = CVA6Cfg.PerfCounterEn; @@ -120,7 +123,7 @@ package build_config_pkg; cfg.AxiBurstWriteEn = CVA6Cfg.AxiBurstWriteEn; cfg.ICACHE_SET_ASSOC = CVA6Cfg.IcacheSetAssoc; - cfg.ICACHE_SET_ASSOC_WIDTH = $clog2(CVA6Cfg.IcacheSetAssoc); + cfg.ICACHE_SET_ASSOC_WIDTH = CVA6Cfg.IcacheSetAssoc > 1 ? 
$clog2(CVA6Cfg.IcacheSetAssoc) : CVA6Cfg.IcacheSetAssoc; cfg.ICACHE_INDEX_WIDTH = ICACHE_INDEX_WIDTH; cfg.ICACHE_TAG_WIDTH = cfg.PLEN - ICACHE_INDEX_WIDTH; cfg.ICACHE_LINE_WIDTH = CVA6Cfg.IcacheLineWidth; @@ -128,7 +131,7 @@ package build_config_pkg; cfg.DCacheType = CVA6Cfg.DCacheType; cfg.DcacheIdWidth = CVA6Cfg.DcacheIdWidth; cfg.DCACHE_SET_ASSOC = CVA6Cfg.DcacheSetAssoc; - cfg.DCACHE_SET_ASSOC_WIDTH = $clog2(CVA6Cfg.DcacheSetAssoc); + cfg.DCACHE_SET_ASSOC_WIDTH = CVA6Cfg.DcacheSetAssoc > 1 ? $clog2(CVA6Cfg.DcacheSetAssoc) : CVA6Cfg.DcacheSetAssoc; cfg.DCACHE_INDEX_WIDTH = DCACHE_INDEX_WIDTH; cfg.DCACHE_TAG_WIDTH = cfg.PLEN - DCACHE_INDEX_WIDTH; cfg.DCACHE_LINE_WIDTH = CVA6Cfg.DcacheLineWidth; @@ -165,6 +168,16 @@ package build_config_pkg; cfg.VpnLen = VpnLen; cfg.PtLevels = PtLevels; + cfg.X_NUM_RS = cfg.NrRgprPorts / cfg.NrIssuePorts; + cfg.X_ID_WIDTH = cfg.TRANS_ID_BITS; + cfg.X_RFR_WIDTH = cfg.XLEN; + cfg.X_RFW_WIDTH = cfg.XLEN; + cfg.X_NUM_HARTS = 1; + cfg.X_HARTID_WIDTH = cfg.XLEN; + cfg.X_DUALREAD = 0; + cfg.X_DUALWRITE = 0; + cfg.X_ISSUE_REGISTER_SPLIT = 0; + return cfg; endfunction diff --git a/core/include/config_pkg.sv b/core/include/config_pkg.sv index 7021162e4c..d711d5f262 100644 --- a/core/include/config_pkg.sv +++ b/core/include/config_pkg.sv @@ -48,6 +48,8 @@ package config_pkg; typedef struct packed { // General Purpose Register Size (in bits) int unsigned XLEN; + // Virtual address Size (in bits) + int unsigned VLEN; // Atomic RISC-V extension bit RVA; // Bit manipulation RISC-V extension @@ -166,11 +168,13 @@ package config_pkg; int unsigned FetchUserEn; // Width of fetch user field int unsigned FetchUserWidth; - // Is FPGA optimization of CV32A6 + // Is FPGA optimization of CV32A6 for Xilinx and Altera bit FpgaEn; + // Is FPGA optimization for Altera FPGA + bit FpgaAlteraEn; // Is Techno Cut instanciated bit TechnoCut; - // Enable superscalar with 2 issue ports and 2 commit ports + // Enable superscalar* with 2 issue ports and 2 commit 
ports. bit SuperscalarEn; // Number of commit ports. Forced to 2 if SuperscalarEn. int unsigned NrCommitPorts; @@ -212,6 +216,7 @@ package config_pkg; int unsigned VMID_WIDTH; bit FpgaEn; + bit FpgaAlteraEn; bit TechnoCut; bit SuperscalarEn; @@ -337,6 +342,17 @@ package config_pkg; vm_mode_t MODE_SV; int unsigned SV; int unsigned SVX; + + int unsigned X_NUM_RS; + int unsigned X_ID_WIDTH; + int unsigned X_RFR_WIDTH; + int unsigned X_RFW_WIDTH; + int unsigned X_NUM_HARTS; + int unsigned X_HARTID_WIDTH; + int unsigned X_DUALREAD; + int unsigned X_DUALWRITE; + int unsigned X_ISSUE_REGISTER_SPLIT; + } cva6_cfg_t; /// Empty configuration to sanity check proper parameter passing. Whenever @@ -355,6 +371,8 @@ package config_pkg; assert (Cfg.NrExecuteRegionRules <= NrMaxRules); assert (Cfg.NrCachedRegionRules <= NrMaxRules); assert (Cfg.NrPMPEntries <= 64); + assert (!(Cfg.SuperscalarEn && Cfg.RVF)); + assert (!(Cfg.SuperscalarEn && Cfg.RVZCMP)); `endif // pragma translate_on endfunction @@ -378,11 +396,15 @@ package config_pkg; function automatic logic is_inside_execute_regions(cva6_cfg_t Cfg, logic [63:0] address); // if we don't specify any region we assume everything is accessible logic [NrMaxRules-1:0] pass; - pass = '0; - for (int unsigned k = 0; k < Cfg.NrExecuteRegionRules; k++) begin - pass[k] = range_check(Cfg.ExecuteRegionAddrBase[k], Cfg.ExecuteRegionLength[k], address); + if (Cfg.NrExecuteRegionRules != 0) begin + pass = '0; + for (int unsigned k = 0; k < Cfg.NrExecuteRegionRules; k++) begin + pass[k] = range_check(Cfg.ExecuteRegionAddrBase[k], Cfg.ExecuteRegionLength[k], address); + end + return |pass; + end else begin + return 1; end - return |pass; endfunction : is_inside_execute_regions function automatic logic is_inside_cacheable_regions(cva6_cfg_t Cfg, logic [63:0] address); diff --git a/core/include/cv32a60x_config_pkg.sv b/core/include/cv32a60x_config_pkg.sv new file mode 100644 index 0000000000..9604b24d39 --- /dev/null +++ 
b/core/include/cv32a60x_config_pkg.sv @@ -0,0 +1,102 @@ +// Copyright 2022 Thales DIS design services SAS +// +// Licensed under the Solderpad Hardware Licence, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// SPDX-License-Identifier: Apache-2.0 WITH SHL-2.0 +// You may obtain a copy of the License at https://solderpad.org/licenses/ +// +// Original Author: Jean-Roch COULON - Thales + +package cva6_config_pkg; + + localparam CVA6ConfigXlen = 32; + + localparam CVA6ConfigRvfiTrace = 1; + + localparam CVA6ConfigAxiIdWidth = 4; // axi_pkg.sv + localparam CVA6ConfigAxiAddrWidth = 64; // axi_pkg.sv + localparam CVA6ConfigAxiDataWidth = 64; // axi_pkg.sv + localparam CVA6ConfigDataUserWidth = 32; // axi_pkg.sv + + localparam config_pkg::cva6_user_cfg_t cva6_cfg = '{ + XLEN: unsigned'(CVA6ConfigXlen), + VLEN: unsigned'(32), + FpgaEn: bit'(0), + FpgaAlteraEn: bit'(0), + TechnoCut: bit'(1), + SuperscalarEn: bit'(0), + NrCommitPorts: unsigned'(1), + AxiAddrWidth: unsigned'(CVA6ConfigAxiAddrWidth), + AxiDataWidth: unsigned'(CVA6ConfigAxiDataWidth), + AxiIdWidth: unsigned'(CVA6ConfigAxiIdWidth), + AxiUserWidth: unsigned'(CVA6ConfigDataUserWidth), + MemTidWidth: unsigned'(CVA6ConfigAxiIdWidth), + NrLoadBufEntries: unsigned'(2), + RVF: bit'(0), + RVD: bit'(0), + XF16: bit'(0), + XF16ALT: bit'(0), + XF8: bit'(0), + RVA: bit'(0), + RVB: bit'(1), + RVV: bit'(0), + RVC: bit'(1), + RVH: bit'(0), + RVZCB: bit'(1), + RVZCMP: bit'(1), + XFVec: bit'(0), + CvxifEn: bit'(1), + RVZiCond: bit'(0), + RVZicntr: bit'(0), + RVZihpm: bit'(0), + NrScoreboardEntries: unsigned'(4), + PerfCounterEn: bit'(0), + MmuPresent: bit'(0), + RVS: bit'(0), + RVU: bit'(0), + HaltAddress: 64'h800, + ExceptionAddress: 64'h808, + RASDepth: unsigned'(2), + BTBEntries: unsigned'(0), + BHTEntries: unsigned'(32), + DmBaseAddress: 64'h0, + TvalEn: bit'(0), + DirectVecOnly: bit'(1), + NrPMPEntries: unsigned'(8), + PMPCfgRstVal: {64{64'h0}}, + PMPAddrRstVal: 
{64{64'h0}}, + PMPEntryReadOnly: 64'd0, + NOCType: config_pkg::NOC_TYPE_AXI4_ATOP, + NrNonIdempotentRules: unsigned'(0), + NonIdempotentAddrBase: 1024'({64'b0, 64'b0}), + NonIdempotentLength: 1024'({64'b0, 64'b0}), + NrExecuteRegionRules: unsigned'(0), + ExecuteRegionAddrBase: 1024'({64'h8000_0000, 64'h1_0000, 64'h0}), + ExecuteRegionLength: 1024'({64'h40000000, 64'h10000, 64'h1000}), + NrCachedRegionRules: unsigned'(1), + CachedRegionAddrBase: 1024'({64'h8000_0000}), + CachedRegionLength: 1024'({64'h40000000}), + MaxOutstandingStores: unsigned'(7), + DebugEn: bit'(0), + AxiBurstWriteEn: bit'(0), + IcacheByteSize: unsigned'(2048), + IcacheSetAssoc: unsigned'(2), + IcacheLineWidth: unsigned'(128), + DCacheType: config_pkg::HPDCACHE, + DcacheByteSize: unsigned'(2028), + DcacheSetAssoc: unsigned'(2), + DcacheLineWidth: unsigned'(128), + DataUserEn: unsigned'(1), + WtDcacheWbufDepth: int'(8), + FetchUserWidth: unsigned'(32), + FetchUserEn: unsigned'(1), + InstrTlbEntries: int'(2), + DataTlbEntries: int'(2), + UseSharedTlb: bit'(1), + SharedTlbDepth: int'(64), + NrLoadPipeRegs: int'(0), + NrStorePipeRegs: int'(0), + DcacheIdWidth: int'(1) + }; + +endpackage diff --git a/core/include/cv32a60x_config_pkg_deprecated.sv b/core/include/cv32a60x_config_pkg_deprecated.sv deleted file mode 100644 index c2637baf1e..0000000000 --- a/core/include/cv32a60x_config_pkg_deprecated.sv +++ /dev/null @@ -1,155 +0,0 @@ -// Copyright 2022 Thales DIS design services SAS -// -// Licensed under the Solderpad Hardware Licence, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. 
-// SPDX-License-Identifier: Apache-2.0 WITH SHL-2.0 -// You may obtain a copy of the License at https://solderpad.org/licenses/ -// -// Original Author: Jean-Roch COULON - Thales - - -package cva6_config_pkg; - - localparam CVA6ConfigXlen = 32; - - localparam CVA6ConfigRVF = 0; - localparam CVA6ConfigF16En = 0; - localparam CVA6ConfigF16AltEn = 0; - localparam CVA6ConfigF8En = 0; - localparam CVA6ConfigFVecEn = 0; - - localparam CVA6ConfigCvxifEn = 1; - localparam CVA6ConfigCExtEn = 1; - localparam CVA6ConfigZcbExtEn = 1; - localparam CVA6ConfigZcmpExtEn = 1; - localparam CVA6ConfigAExtEn = 1; - localparam CVA6ConfigHExtEn = 0; // always disabled - localparam CVA6ConfigBExtEn = 1; - localparam CVA6ConfigVExtEn = 0; - localparam CVA6ConfigRVZiCond = 1; - - localparam CVA6ConfigAxiIdWidth = 4; - localparam CVA6ConfigAxiAddrWidth = 64; - localparam CVA6ConfigAxiDataWidth = 64; - localparam CVA6ConfigFetchUserEn = 0; - localparam CVA6ConfigFetchUserWidth = CVA6ConfigXlen; - localparam CVA6ConfigDataUserEn = 0; - localparam CVA6ConfigDataUserWidth = CVA6ConfigXlen; - - localparam CVA6ConfigIcacheByteSize = 16384; - localparam CVA6ConfigIcacheSetAssoc = 4; - localparam CVA6ConfigIcacheLineWidth = 128; - localparam CVA6ConfigDcacheByteSize = 32768; - localparam CVA6ConfigDcacheSetAssoc = 8; - localparam CVA6ConfigDcacheLineWidth = 128; - - localparam CVA6ConfigDcacheIdWidth = 1; - localparam CVA6ConfigMemTidWidth = 2; - - localparam CVA6ConfigWtDcacheWbufDepth = 8; - - localparam CVA6ConfigSuperscalarEn = 0; - localparam CVA6ConfigNrCommitPorts = 1; - localparam CVA6ConfigNrScoreboardEntries = 4; - - localparam CVA6ConfigFpgaEn = 0; - - localparam CVA6ConfigNrLoadPipeRegs = 1; - localparam CVA6ConfigNrStorePipeRegs = 0; - localparam CVA6ConfigNrLoadBufEntries = 2; - - localparam CVA6ConfigRASDepth = 0; - localparam CVA6ConfigBTBEntries = 0; - localparam CVA6ConfigBHTEntries = 0; - - localparam CVA6ConfigTvalEn = 1; - - localparam CVA6ConfigNrPMPEntries = 8; - - 
localparam CVA6ConfigPerfCounterEn = 0; - - localparam config_pkg::cache_type_t CVA6ConfigDcacheType = config_pkg::WT; - - localparam CVA6ConfigMmuPresent = 1; - - localparam CVA6ConfigRvfiTrace = 1; - - localparam config_pkg::cva6_user_cfg_t cva6_cfg = '{ - XLEN: unsigned'(CVA6ConfigXlen), - FpgaEn: bit'(CVA6ConfigFpgaEn), - TechnoCut: bit'(0), - NrCommitPorts: unsigned'(CVA6ConfigNrCommitPorts), - AxiAddrWidth: unsigned'(CVA6ConfigAxiAddrWidth), - AxiDataWidth: unsigned'(CVA6ConfigAxiDataWidth), - AxiIdWidth: unsigned'(CVA6ConfigAxiIdWidth), - AxiUserWidth: unsigned'(CVA6ConfigDataUserWidth), - MemTidWidth: unsigned'(CVA6ConfigMemTidWidth), - NrLoadBufEntries: unsigned'(CVA6ConfigNrLoadBufEntries), - RVF: bit'(CVA6ConfigRVF), - RVD: bit'(CVA6ConfigRVF), - XF16: bit'(CVA6ConfigF16En), - XF16ALT: bit'(CVA6ConfigF16AltEn), - XF8: bit'(CVA6ConfigF8En), - RVA: bit'(CVA6ConfigAExtEn), - RVB: bit'(CVA6ConfigBExtEn), - RVV: bit'(CVA6ConfigVExtEn), - RVC: bit'(CVA6ConfigCExtEn), - RVH: bit'(CVA6ConfigHExtEn), - RVZCB: bit'(CVA6ConfigZcbExtEn), - RVZCMP: bit'(CVA6ConfigZcmpExtEn), - XFVec: bit'(CVA6ConfigFVecEn), - CvxifEn: bit'(CVA6ConfigCvxifEn), - RVZiCond: bit'(CVA6ConfigRVZiCond), - RVZicntr: bit'(1), - RVZihpm: bit'(1), - NrScoreboardEntries: unsigned'(CVA6ConfigNrScoreboardEntries), - PerfCounterEn: bit'(CVA6ConfigPerfCounterEn), - MmuPresent: bit'(CVA6ConfigMmuPresent), - RVS: bit'(1), - RVU: bit'(1), - HaltAddress: 64'h800, - ExceptionAddress: 64'h808, - RASDepth: unsigned'(CVA6ConfigRASDepth), - BTBEntries: unsigned'(CVA6ConfigBTBEntries), - BHTEntries: unsigned'(CVA6ConfigBHTEntries), - DmBaseAddress: 64'h0, - TvalEn: bit'(CVA6ConfigTvalEn), - DirectVecOnly: bit'(0), - NrPMPEntries: unsigned'(CVA6ConfigNrPMPEntries), - PMPCfgRstVal: {64{64'h0}}, - PMPAddrRstVal: {64{64'h0}}, - PMPEntryReadOnly: 64'd0, - NOCType: config_pkg::NOC_TYPE_AXI4_ATOP, - NrNonIdempotentRules: unsigned'(2), - NonIdempotentAddrBase: 1024'({64'b0, 64'b0}), - NonIdempotentLength: 
1024'({64'b0, 64'b0}), - NrExecuteRegionRules: unsigned'(3), - ExecuteRegionAddrBase: 1024'({64'h8000_0000, 64'h1_0000, 64'h0}), - ExecuteRegionLength: 1024'({64'h40000000, 64'h10000, 64'h1000}), - NrCachedRegionRules: unsigned'(1), - CachedRegionAddrBase: 1024'({64'h8000_0000}), - CachedRegionLength: 1024'({64'h40000000}), - MaxOutstandingStores: unsigned'(7), - DebugEn: bit'(1), - AxiBurstWriteEn: bit'(0), - IcacheByteSize: unsigned'(CVA6ConfigIcacheByteSize), - IcacheSetAssoc: unsigned'(CVA6ConfigIcacheSetAssoc), - IcacheLineWidth: unsigned'(CVA6ConfigIcacheLineWidth), - DCacheType: CVA6ConfigDcacheType, - DcacheByteSize: unsigned'(CVA6ConfigDcacheByteSize), - DcacheSetAssoc: unsigned'(CVA6ConfigDcacheSetAssoc), - DcacheLineWidth: unsigned'(CVA6ConfigDcacheLineWidth), - DataUserEn: unsigned'(CVA6ConfigDataUserEn), - WtDcacheWbufDepth: int'(CVA6ConfigWtDcacheWbufDepth), - FetchUserWidth: unsigned'(CVA6ConfigFetchUserWidth), - FetchUserEn: unsigned'(CVA6ConfigFetchUserEn), - InstrTlbEntries: int'(2), - DataTlbEntries: int'(2), - UseSharedTlb: bit'(1), - SharedTlbDepth: int'(64), - NrLoadPipeRegs: int'(CVA6ConfigNrLoadPipeRegs), - NrStorePipeRegs: int'(CVA6ConfigNrStorePipeRegs), - DcacheIdWidth: int'(CVA6ConfigDcacheIdWidth) - }; - -endpackage diff --git a/core/include/cv32a65x_config_pkg.sv b/core/include/cv32a65x_config_pkg.sv index 10b7e88afa..d9d028fa16 100644 --- a/core/include/cv32a65x_config_pkg.sv +++ b/core/include/cv32a65x_config_pkg.sv @@ -18,13 +18,13 @@ package cva6_config_pkg; localparam CVA6ConfigAxiDataWidth = 64; // axi_pkg.sv localparam CVA6ConfigDataUserWidth = 32; // axi_pkg.sv - localparam CVA6ConfigNrScoreboardEntries = 4; // cvxif_pkg.sv - localparam config_pkg::cva6_user_cfg_t cva6_cfg = '{ XLEN: unsigned'(CVA6ConfigXlen), + VLEN: unsigned'(32), FpgaEn: bit'(0), + FpgaAlteraEn: bit'(0), TechnoCut: bit'(1), - SuperscalarEn: bit'(0), + SuperscalarEn: bit'(1), NrCommitPorts: unsigned'(1), AxiAddrWidth: unsigned'(CVA6ConfigAxiAddrWidth), 
AxiDataWidth: unsigned'(CVA6ConfigAxiDataWidth), @@ -49,7 +49,7 @@ package cva6_config_pkg; RVZiCond: bit'(0), RVZicntr: bit'(0), RVZihpm: bit'(0), - NrScoreboardEntries: unsigned'(CVA6ConfigNrScoreboardEntries), + NrScoreboardEntries: unsigned'(8), PerfCounterEn: bit'(0), MmuPresent: bit'(0), RVS: bit'(0), @@ -67,10 +67,10 @@ package cva6_config_pkg; PMPAddrRstVal: {64{64'h0}}, PMPEntryReadOnly: 64'd0, NOCType: config_pkg::NOC_TYPE_AXI4_ATOP, - NrNonIdempotentRules: unsigned'(2), + NrNonIdempotentRules: unsigned'(0), NonIdempotentAddrBase: 1024'({64'b0, 64'b0}), NonIdempotentLength: 1024'({64'b0, 64'b0}), - NrExecuteRegionRules: unsigned'(3), + NrExecuteRegionRules: unsigned'(0), ExecuteRegionAddrBase: 1024'({64'h8000_0000, 64'h1_0000, 64'h0}), ExecuteRegionLength: 1024'({64'h40000000, 64'h10000, 64'h1000}), NrCachedRegionRules: unsigned'(1), @@ -87,7 +87,7 @@ package cva6_config_pkg; DcacheSetAssoc: unsigned'(2), DcacheLineWidth: unsigned'(128), DataUserEn: unsigned'(1), - WtDcacheWbufDepth: int'(2), + WtDcacheWbufDepth: int'(8), FetchUserWidth: unsigned'(32), FetchUserEn: unsigned'(1), InstrTlbEntries: int'(2), diff --git a/core/include/cv32a6_embedded_config_pkg_deprecated.sv b/core/include/cv32a6_embedded_config_pkg_deprecated.sv index b8990ae566..ff6a1fcd41 100644 --- a/core/include/cv32a6_embedded_config_pkg_deprecated.sv +++ b/core/include/cv32a6_embedded_config_pkg_deprecated.sv @@ -51,8 +51,6 @@ package cva6_config_pkg; localparam CVA6ConfigNrCommitPorts = 1; localparam CVA6ConfigNrScoreboardEntries = 4; - localparam CVA6ConfigFpgaEn = 0; - localparam CVA6ConfigNrLoadPipeRegs = 0; localparam CVA6ConfigNrStorePipeRegs = 0; localparam CVA6ConfigNrLoadBufEntries = 1; @@ -75,7 +73,8 @@ package cva6_config_pkg; localparam config_pkg::cva6_user_cfg_t cva6_cfg = '{ XLEN: unsigned'(CVA6ConfigXlen), - FpgaEn: bit'(CVA6ConfigFpgaEn), + FpgaEn: bit'(0), // for Xilinx and Altera + FpgaAlteraEn: bit'(0), // for Altera (only) TechnoCut: bit'(0), NrCommitPorts: 
unsigned'(CVA6ConfigNrCommitPorts), AxiAddrWidth: unsigned'(CVA6ConfigAxiAddrWidth), diff --git a/core/include/cv32a6_ima_sv32_fpga_config_pkg.sv b/core/include/cv32a6_ima_sv32_fpga_config_pkg.sv index 0e6510ba5c..be7ff3ee14 100644 --- a/core/include/cv32a6_ima_sv32_fpga_config_pkg.sv +++ b/core/include/cv32a6_ima_sv32_fpga_config_pkg.sv @@ -50,8 +50,6 @@ package cva6_config_pkg; localparam CVA6ConfigNrScoreboardEntries = 4; - localparam CVA6ConfigFpgaEn = 1; - localparam CVA6ConfigNrLoadPipeRegs = 1; localparam CVA6ConfigNrStorePipeRegs = 0; localparam CVA6ConfigNrLoadBufEntries = 2; @@ -74,7 +72,9 @@ package cva6_config_pkg; localparam config_pkg::cva6_user_cfg_t cva6_cfg = '{ XLEN: unsigned'(CVA6ConfigXlen), - FpgaEn: bit'(CVA6ConfigFpgaEn), + VLEN: unsigned'(32), + FpgaEn: bit'(1), // for Xilinx and Altera + FpgaAlteraEn: bit'(0), // for Altera (only) TechnoCut: bit'(0), SuperscalarEn: bit'(0), NrCommitPorts: unsigned'(1), diff --git a/core/include/cv32a6_imac_sv0_config_pkg.sv b/core/include/cv32a6_imac_sv0_config_pkg.sv index 1fe4d3f551..2687370b26 100644 --- a/core/include/cv32a6_imac_sv0_config_pkg.sv +++ b/core/include/cv32a6_imac_sv0_config_pkg.sv @@ -51,8 +51,6 @@ package cva6_config_pkg; localparam CVA6ConfigNrScoreboardEntries = 8; localparam CVA6ConfigNrLoadBufEntries = 2; - localparam CVA6ConfigFpgaEn = 0; - localparam CVA6ConfigNrLoadPipeRegs = 1; localparam CVA6ConfigNrStorePipeRegs = 0; @@ -74,7 +72,9 @@ package cva6_config_pkg; localparam config_pkg::cva6_user_cfg_t cva6_cfg = '{ XLEN: unsigned'(CVA6ConfigXlen), - FpgaEn: bit'(CVA6ConfigFpgaEn), + VLEN: unsigned'(32), + FpgaEn: bit'(0), // for Xilinx and Altera + FpgaAlteraEn: bit'(0), // for Altera (only) TechnoCut: bit'(0), SuperscalarEn: bit'(0), NrCommitPorts: unsigned'(2), diff --git a/core/include/cv32a6_imac_sv32_config_pkg.sv b/core/include/cv32a6_imac_sv32_config_pkg.sv index 27ad32d10c..9c2e622947 100644 --- a/core/include/cv32a6_imac_sv32_config_pkg.sv +++ 
b/core/include/cv32a6_imac_sv32_config_pkg.sv @@ -43,15 +43,13 @@ package cva6_config_pkg; localparam CVA6ConfigDcacheSetAssoc = 8; localparam CVA6ConfigDcacheLineWidth = 128; - localparam CVA6ConfigDcacheIdWidth = 1; - localparam CVA6ConfigMemTidWidth = 2; + localparam CVA6ConfigDcacheIdWidth = 3; + localparam CVA6ConfigMemTidWidth = 4; localparam CVA6ConfigWtDcacheWbufDepth = 8; localparam CVA6ConfigNrScoreboardEntries = 8; - localparam CVA6ConfigFpgaEn = 0; - localparam CVA6ConfigNrLoadPipeRegs = 1; localparam CVA6ConfigNrStorePipeRegs = 0; localparam CVA6ConfigNrLoadBufEntries = 2; @@ -66,7 +64,7 @@ package cva6_config_pkg; localparam CVA6ConfigPerfCounterEn = 1; - localparam config_pkg::cache_type_t CVA6ConfigDcacheType = config_pkg::WT; + localparam config_pkg::cache_type_t CVA6ConfigDcacheType = config_pkg::HPDCACHE; localparam CVA6ConfigMmuPresent = 1; @@ -74,7 +72,9 @@ package cva6_config_pkg; localparam config_pkg::cva6_user_cfg_t cva6_cfg = '{ XLEN: unsigned'(CVA6ConfigXlen), - FpgaEn: bit'(CVA6ConfigFpgaEn), + VLEN: unsigned'(32), + FpgaEn: bit'(0), // for Xilinx and Altera + FpgaAlteraEn: bit'(0), // for Altera (only) TechnoCut: bit'(0), SuperscalarEn: bit'(0), NrCommitPorts: unsigned'(2), @@ -119,9 +119,9 @@ package cva6_config_pkg; PMPAddrRstVal: {64{64'h0}}, PMPEntryReadOnly: 64'd0, NOCType: config_pkg::NOC_TYPE_AXI4_ATOP, - NrNonIdempotentRules: unsigned'(2), - NonIdempotentAddrBase: 1024'({64'b0, 64'b0}), - NonIdempotentLength: 1024'({64'b0, 64'b0}), + NrNonIdempotentRules: unsigned'(1), + NonIdempotentAddrBase: 1024'({64'b0}), + NonIdempotentLength: 1024'({64'h8000_0000}), NrExecuteRegionRules: unsigned'(3), ExecuteRegionAddrBase: 1024'({64'h8000_0000, 64'h1_0000, 64'h0}), ExecuteRegionLength: 1024'({64'h40000000, 64'h10000, 64'h1000}), diff --git a/core/include/cv32a6_imafc_sv32_config_pkg.sv b/core/include/cv32a6_imafc_sv32_config_pkg.sv index e00ba727fd..6ad09136e4 100644 --- a/core/include/cv32a6_imafc_sv32_config_pkg.sv +++ 
b/core/include/cv32a6_imafc_sv32_config_pkg.sv @@ -50,8 +50,6 @@ package cva6_config_pkg; localparam CVA6ConfigNrScoreboardEntries = 8; - localparam CVA6ConfigFpgaEn = 0; - localparam CVA6ConfigNrLoadPipeRegs = 1; localparam CVA6ConfigNrStorePipeRegs = 0; localparam CVA6ConfigNrLoadBufEntries = 2; @@ -74,7 +72,9 @@ package cva6_config_pkg; localparam config_pkg::cva6_user_cfg_t cva6_cfg = '{ XLEN: unsigned'(CVA6ConfigXlen), - FpgaEn: bit'(CVA6ConfigFpgaEn), + VLEN: unsigned'(32), + FpgaEn: bit'(0), // for Xilinx and Altera + FpgaAlteraEn: bit'(0), // for Altera (only) TechnoCut: bit'(0), SuperscalarEn: bit'(0), NrCommitPorts: unsigned'(2), diff --git a/core/include/cv64a6_imadfcv_sv39_polara_config_pkg.sv b/core/include/cv64a6_imadfcv_sv39_polara_config_pkg.sv index 96315e873d..0422eef3bf 100644 --- a/core/include/cv64a6_imadfcv_sv39_polara_config_pkg.sv +++ b/core/include/cv64a6_imadfcv_sv39_polara_config_pkg.sv @@ -50,8 +50,6 @@ package cva6_config_pkg; localparam CVA6ConfigNrScoreboardEntries = 8; - localparam CVA6ConfigFpgaEn = 0; - localparam CVA6ConfigNrLoadPipeRegs = 1; localparam CVA6ConfigNrStorePipeRegs = 0; localparam CVA6ConfigNrLoadBufEntries = 2; @@ -74,7 +72,9 @@ package cva6_config_pkg; localparam config_pkg::cva6_user_cfg_t cva6_cfg = '{ XLEN: unsigned'(CVA6ConfigXlen), - FpgaEn: bit'(CVA6ConfigFpgaEn), + VLEN: unsigned'(64), + FpgaEn: bit'(0), // for Xilinx and Altera + FpgaAlteraEn: bit'(0), // for Altera (only) TechnoCut: bit'(0), SuperscalarEn: bit'(0), NrCommitPorts: unsigned'(2), diff --git a/core/include/cv64a6_imafdc_sv39_config_pkg.sv b/core/include/cv64a6_imafdc_sv39_config_pkg.sv index 0f02d00657..8050e88f27 100644 --- a/core/include/cv64a6_imafdc_sv39_config_pkg.sv +++ b/core/include/cv64a6_imafdc_sv39_config_pkg.sv @@ -50,8 +50,6 @@ package cva6_config_pkg; localparam CVA6ConfigNrScoreboardEntries = 8; - localparam CVA6ConfigFpgaEn = 0; - localparam CVA6ConfigNrLoadPipeRegs = 1; localparam CVA6ConfigNrStorePipeRegs = 0; localparam 
CVA6ConfigNrLoadBufEntries = 2; @@ -74,7 +72,9 @@ package cva6_config_pkg; localparam config_pkg::cva6_user_cfg_t cva6_cfg = '{ XLEN: unsigned'(CVA6ConfigXlen), - FpgaEn: bit'(CVA6ConfigFpgaEn), + VLEN: unsigned'(64), + FpgaEn: bit'(0), // for Xilinx and Altera + FpgaAlteraEn: bit'(0), // for Altera (only) TechnoCut: bit'(0), SuperscalarEn: bit'(0), NrCommitPorts: unsigned'(2), diff --git a/core/include/cv64a6_imafdc_sv39_hpdcache_config_pkg.sv b/core/include/cv64a6_imafdc_sv39_hpdcache_config_pkg.sv index 32369a3ab8..36c4397ade 100644 --- a/core/include/cv64a6_imafdc_sv39_hpdcache_config_pkg.sv +++ b/core/include/cv64a6_imafdc_sv39_hpdcache_config_pkg.sv @@ -57,8 +57,6 @@ package cva6_config_pkg; localparam CVA6ConfigNrScoreboardEntries = 8; - localparam CVA6ConfigFpgaEn = 0; - localparam CVA6ConfigNrLoadPipeRegs = 1; localparam CVA6ConfigNrStorePipeRegs = 0; localparam CVA6ConfigNrLoadBufEntries = 8; @@ -81,7 +79,9 @@ package cva6_config_pkg; localparam config_pkg::cva6_user_cfg_t cva6_cfg = '{ XLEN: unsigned'(CVA6ConfigXlen), - FpgaEn: bit'(CVA6ConfigFpgaEn), + VLEN: unsigned'(64), + FpgaEn: bit'(0), // for Xilinx and Altera + FpgaAlteraEn: bit'(0), // for Altera (only) TechnoCut: bit'(0), SuperscalarEn: bit'(0), NrCommitPorts: unsigned'(2), diff --git a/core/include/cv64a6_imafdc_sv39_openpiton_config_pkg.sv b/core/include/cv64a6_imafdc_sv39_openpiton_config_pkg.sv index ae6d9abd9c..16f329c456 100644 --- a/core/include/cv64a6_imafdc_sv39_openpiton_config_pkg.sv +++ b/core/include/cv64a6_imafdc_sv39_openpiton_config_pkg.sv @@ -50,8 +50,6 @@ package cva6_config_pkg; localparam CVA6ConfigNrScoreboardEntries = 8; - localparam CVA6ConfigFpgaEn = 0; - localparam CVA6ConfigNrLoadPipeRegs = 1; localparam CVA6ConfigNrStorePipeRegs = 0; localparam CVA6ConfigNrLoadBufEntries = 2; @@ -74,7 +72,9 @@ package cva6_config_pkg; localparam config_pkg::cva6_user_cfg_t cva6_cfg = '{ XLEN: unsigned'(CVA6ConfigXlen), - FpgaEn: bit'(CVA6ConfigFpgaEn), + VLEN: unsigned'(64), + FpgaEn: 
bit'(0), // for Xilinx and Altera + FpgaAlteraEn: bit'(0), // for Altera (only) TechnoCut: bit'(0), SuperscalarEn: bit'(0), NrCommitPorts: unsigned'(2), diff --git a/core/include/cv64a6_imafdc_sv39_wb_config_pkg.sv b/core/include/cv64a6_imafdc_sv39_wb_config_pkg.sv index 339a5803c7..b785da8296 100644 --- a/core/include/cv64a6_imafdc_sv39_wb_config_pkg.sv +++ b/core/include/cv64a6_imafdc_sv39_wb_config_pkg.sv @@ -50,8 +50,6 @@ package cva6_config_pkg; localparam CVA6ConfigNrScoreboardEntries = 8; - localparam CVA6ConfigFpgaEn = 0; - localparam CVA6ConfigNrLoadPipeRegs = 1; localparam CVA6ConfigNrStorePipeRegs = 0; localparam CVA6ConfigNrLoadBufEntries = 2; @@ -74,7 +72,9 @@ package cva6_config_pkg; localparam config_pkg::cva6_user_cfg_t cva6_cfg = '{ XLEN: unsigned'(CVA6ConfigXlen), - FpgaEn: bit'(CVA6ConfigFpgaEn), + VLEN: unsigned'(64), + FpgaEn: bit'(0), // for Xilinx and Altera + FpgaAlteraEn: bit'(0), // for Altera (only) TechnoCut: bit'(0), SuperscalarEn: bit'(0), NrCommitPorts: unsigned'(2), diff --git a/core/include/cv64a6_imafdch_sv39_config_pkg.sv b/core/include/cv64a6_imafdch_sv39_config_pkg.sv index 560b0b8468..6e4d755ea1 100644 --- a/core/include/cv64a6_imafdch_sv39_config_pkg.sv +++ b/core/include/cv64a6_imafdch_sv39_config_pkg.sv @@ -50,8 +50,6 @@ package cva6_config_pkg; localparam CVA6ConfigNrScoreboardEntries = 8; - localparam CVA6ConfigFpgaEn = 0; - localparam CVA6ConfigNrLoadPipeRegs = 1; localparam CVA6ConfigNrStorePipeRegs = 0; localparam CVA6ConfigNrLoadBufEntries = 2; @@ -74,7 +72,9 @@ package cva6_config_pkg; localparam config_pkg::cva6_user_cfg_t cva6_cfg = '{ XLEN: unsigned'(CVA6ConfigXlen), - FpgaEn: bit'(CVA6ConfigFpgaEn), + VLEN: unsigned'(64), + FpgaEn: bit'(0), // for Xilinx and Altera + FpgaAlteraEn: bit'(0), // for Altera (only) TechnoCut: bit'(0), SuperscalarEn: bit'(0), NrCommitPorts: unsigned'(2), diff --git a/core/include/cv64a6_imafdch_sv39_wb_config_pkg.sv b/core/include/cv64a6_imafdch_sv39_wb_config_pkg.sv index 
7a05878ce6..49bc7693b2 100644 --- a/core/include/cv64a6_imafdch_sv39_wb_config_pkg.sv +++ b/core/include/cv64a6_imafdch_sv39_wb_config_pkg.sv @@ -50,8 +50,6 @@ package cva6_config_pkg; localparam CVA6ConfigNrScoreboardEntries = 8; - localparam CVA6ConfigFpgaEn = 0; - localparam CVA6ConfigNrLoadPipeRegs = 1; localparam CVA6ConfigNrStorePipeRegs = 0; localparam CVA6ConfigNrLoadBufEntries = 2; @@ -74,8 +72,10 @@ package cva6_config_pkg; localparam config_pkg::cva6_user_cfg_t cva6_cfg = '{ XLEN: unsigned'(CVA6ConfigXlen), - FpgaEn: bit'(CVA6ConfigFpgaEn), - TechnoCut: bit'(CVA6ConfigTechnoCut), + VLEN: unsigned'(64), + FpgaEn: bit'(0), // for Xilinx and Altera + FpgaAlteraEn: bit'(0), // for Altera (only) + TechnoCut: bit'(0), SuperscalarEn: bit'(0), NrCommitPorts: unsigned'(2), AxiAddrWidth: unsigned'(CVA6ConfigAxiAddrWidth), diff --git a/core/include/cv64a6_imafdcv_sv39_config_pkg.sv b/core/include/cv64a6_imafdcv_sv39_config_pkg.sv index c31c2b3819..2c642f5345 100644 --- a/core/include/cv64a6_imafdcv_sv39_config_pkg.sv +++ b/core/include/cv64a6_imafdcv_sv39_config_pkg.sv @@ -50,8 +50,6 @@ package cva6_config_pkg; localparam CVA6ConfigNrScoreboardEntries = 8; - localparam CVA6ConfigFpgaEn = 0; - localparam CVA6ConfigNrLoadPipeRegs = 1; localparam CVA6ConfigNrStorePipeRegs = 0; localparam CVA6ConfigNrLoadBufEntries = 2; @@ -74,7 +72,9 @@ package cva6_config_pkg; localparam config_pkg::cva6_user_cfg_t cva6_cfg = '{ XLEN: unsigned'(CVA6ConfigXlen), - FpgaEn: bit'(CVA6ConfigFpgaEn), + VLEN: unsigned'(64), + FpgaEn: bit'(0), // for Xilinx and Altera + FpgaAlteraEn: bit'(0), // for Altera (only) TechnoCut: bit'(0), SuperscalarEn: bit'(0), NrCommitPorts: unsigned'(1), diff --git a/core/include/cv64a6_mmu_config_pkg.sv b/core/include/cv64a6_mmu_config_pkg.sv index 9f30c220ab..8e0b686386 100644 --- a/core/include/cv64a6_mmu_config_pkg.sv +++ b/core/include/cv64a6_mmu_config_pkg.sv @@ -27,7 +27,9 @@ package cva6_config_pkg; localparam config_pkg::cva6_user_cfg_t cva6_cfg = '{ 
XLEN: unsigned'(CVA6ConfigXlen), + VLEN: unsigned'(64), FpgaEn: bit'(0), + FpgaAlteraEn: bit'(0), TechnoCut: bit'(0), SuperscalarEn: bit'(0), NrCommitPorts: unsigned'(1), diff --git a/core/include/cvxif_pkg.sv b/core/include/cvxif_pkg.sv deleted file mode 100644 index 57bd40b01e..0000000000 --- a/core/include/cvxif_pkg.sv +++ /dev/null @@ -1,110 +0,0 @@ -// Copyright 2021 Thales DIS design services SAS -// -// Licensed under the Solderpad Hardware Licence, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// SPDX-License-Identifier: Apache-2.0 WITH SHL-2.0 -// You may obtain a copy of the License at https://solderpad.org/licenses/ -// -// Original Author: Guillaume CHAUVON (guillaume.chauvon@thalesgroup.com) - -// Package for the CoreV-X-Interface for the CVA6 - -package cvxif_pkg; - - localparam X_DATAWIDTH = riscv::XLEN; - localparam X_NUM_RS = ariane_pkg::NR_RGPR_PORTS; //2 or 3 - localparam X_ID_WIDTH = $clog2(cva6_config_pkg::CVA6ConfigNrScoreboardEntries); - localparam X_MEM_WIDTH = 64; - localparam X_RFR_WIDTH = riscv::XLEN; - localparam X_RFW_WIDTH = riscv::XLEN; - - typedef struct packed { - logic [15:0] instr; - logic [1:0] mode; - logic [X_ID_WIDTH-1:0] id; - } x_compressed_req_t; - - typedef struct packed { - logic [31:0] instr; - logic accept; - } x_compressed_resp_t; - - typedef struct packed { - logic [31:0] instr; - logic [1:0] mode; - logic [X_ID_WIDTH-1:0] id; - logic [X_NUM_RS-1:0][X_RFR_WIDTH-1:0] rs; - logic [X_NUM_RS-1:0] rs_valid; - } x_issue_req_t; - - typedef struct packed { - logic accept; - logic writeback; - logic dualwrite; - logic dualread; - logic loadstore; - logic exc; - } x_issue_resp_t; - - typedef struct packed { - logic [X_ID_WIDTH-1:0] id; - logic x_commit_kill; - } x_commit_t; - - typedef struct packed { - logic [X_ID_WIDTH-1:0] id; - logic [31:0] addr; - logic [1:0] mode; - logic we; - logic [1:0] size; - logic [X_MEM_WIDTH-1:0] wdata; - logic last; - logic spec; - } 
x_mem_req_t; - - typedef struct packed { - logic exc; - logic [5:0] exccode; - } x_mem_resp_t; - - typedef struct packed { - logic [X_ID_WIDTH-1:0] id; - logic [X_MEM_WIDTH-1:0] rdata; - logic err; - } x_mem_result_t; - - typedef struct packed { - logic [X_ID_WIDTH-1:0] id; - logic [X_RFW_WIDTH-1:0] data; - logic [4:0] rd; - logic we; - logic exc; - logic [5:0] exccode; - } x_result_t; - - typedef struct packed { - logic x_compressed_valid; - x_compressed_req_t x_compressed_req; - logic x_issue_valid; - x_issue_req_t x_issue_req; - logic x_commit_valid; - x_commit_t x_commit; - logic x_mem_ready; - x_mem_resp_t x_mem_resp; - logic x_mem_result_valid; - x_mem_result_t x_mem_result; - logic x_result_ready; - } cvxif_req_t; - - typedef struct packed { - logic x_compressed_ready; - x_compressed_resp_t x_compressed_resp; - logic x_issue_ready; - x_issue_resp_t x_issue_resp; - logic x_mem_valid; - x_mem_req_t x_mem_req; - logic x_result_valid; - x_result_t x_result; - } cvxif_resp_t; - -endpackage diff --git a/core/include/cvxif_types.svh b/core/include/cvxif_types.svh new file mode 100644 index 0000000000..211742e05a --- /dev/null +++ b/core/include/cvxif_types.svh @@ -0,0 +1,73 @@ +`ifndef CVXIF_TYPES_SVH +`define CVXIF_TYPES_SVH + +//CVXIF +`define READREGFLAGS_T(Cfg) logic [Cfg.X_NUM_RS+Cfg.X_DUALREAD-1:0] +`define WRITEREGFLAGS_T(Cfg) logic [Cfg.X_DUALWRITE:0] +`define ID_T(Cfg) logic [Cfg.X_ID_WIDTH-1:0] +`define HARTID_T(Cfg) logic [Cfg.X_HARTID_WIDTH-1:0] + +`define X_COMPRESSED_REQ_T(Cfg, hartid_t) struct packed { \ + logic [15:0] instr; /*Offloaded compressed instruction*/ \ + hartid_t hartid; /*Identification of the hart offloading the instruction*/ \ +} +`define X_COMPRESSED_RESP_T(Cfg) struct packed { \ + logic [31:0] instr; /*Uncompressed instruction*/ \ + logic accept; /*Is the offloaded compressed instruction (id) accepted by the coprocessor?*/ \ +} + +`define X_ISSUE_REQ_T(Cfg, hartit_t, id_t) struct packed { \ + logic [31:0] instr; /*Offloaded 
instruction*/ \ + hartid_t hartid; /*Identification of the hart offloading the instruction*/ \ + id_t id; /*Identification of the offloaded instruction*/ \ +} +`define X_ISSUE_RESP_T(Cfg, writeregflags_t, readregflags_t) struct packed { \ + logic accept; /*Is the offloaded instruction (id) accepted by the coprocessor?*/ \ + writeregflags_t writeback; /*Will the coprocessor perform a writeback in the core to rd?*/ \ + readregflags_t register_read; /*Will the coprocessor require specific registers to be read?*/ \ +} + +`define X_REGISTER_T(Cfg, hartid_t, id_t, readregflags_t) struct packed { \ + hartid_t hartid; /*Identification of the hart offloading the instruction*/ \ + id_t id; /*Identification of the offloaded instruction*/ \ + logic [Cfg.X_NUM_RS-1:0][Cfg.X_RFR_WIDTH-1:0] rs; /*Register file source operands for the offloaded instruction.*/ \ + readregflags_t rs_valid; /*Validity of the register file source operand(s).*/ \ +} + +`define X_COMMIT_T(Cfg, hartid_t, id_t) struct packed { \ + hartid_t hartid; /*Identification of the hart offloading the instruction*/ \ + id_t id; /*Identification of the offloaded instruction*/ \ + logic commit_kill; /*Shall an offloaded instruction be killed?*/ \ +} + +`define X_RESULT_T(Cfg, hartid_t, id_t, writeregflags_t) struct packed { \ + hartid_t hartid; /*Identification of the hart offloading the instruction*/ \ + id_t id; /*Identification of the offloaded instruction*/ \ + logic [Cfg.X_RFW_WIDTH-1:0] data; /*Register file write data value(s)*/ \ + logic [4:0] rd; /*Register file destination address(es)*/ \ + writeregflags_t we; /*Register file write enable(s)*/ \ +} + +`define CVXIF_REQ_T(Cfg, x_compressed_req_t, x_issue_req_t, x_register_req_t, x_commit_t) struct packed { \ + logic compressed_valid; \ + x_compressed_req_t compressed_req; \ + logic issue_valid; \ + x_issue_req_t issue_req; \ + logic register_valid; \ + x_register_t register; \ + logic commit_valid; \ + x_commit_t commit; \ + logic result_ready; \ +} +
+`define CVXIF_RESP_T(Cfg, x_compressed_resp_t, x_issue_resp_t, x_result_t) struct packed { \ + logic compressed_ready; \ + x_compressed_resp_t compressed_resp; \ + logic issue_ready; \ + x_issue_resp_t issue_resp; \ + logic register_ready; \ + logic result_valid; \ + x_result_t result; \ +} + +`endif // CVXIF_TYPES_SVH diff --git a/core/include/riscv_pkg.sv b/core/include/riscv_pkg.sv index 647ff4c7b8..40499c6371 100644 --- a/core/include/riscv_pkg.sv +++ b/core/include/riscv_pkg.sv @@ -23,7 +23,6 @@ package riscv; // FIXME stop using them from CoreV-Verif and HPDCache // Then remove them from this package localparam XLEN = cva6_config_pkg::CVA6ConfigXlen; - localparam VLEN = (XLEN == 32) ? 32 : 64; localparam PLEN = (XLEN == 32) ? 34 : 56; // -------------------- diff --git a/core/include/rvfi_types.svh b/core/include/rvfi_types.svh index df91ac5706..68881bf0d1 100644 --- a/core/include/rvfi_types.svh +++ b/core/include/rvfi_types.svh @@ -7,9 +7,9 @@ logic [config_pkg::NRET*64-1:0] order; \ logic [config_pkg::NRET*config_pkg::ILEN-1:0] insn; \ logic [config_pkg::NRET-1:0] trap; \ - logic [config_pkg::NRET*Cfg.XLEN-1:0] cause; \ + logic [config_pkg::NRET*Cfg.XLEN-1:0] cause; \ logic [config_pkg::NRET-1:0] halt; \ - logic [config_pkg::NRET-1:0] intr; \ + logic [config_pkg::NRET*Cfg.XLEN-1:0] intr; \ logic [config_pkg::NRET*2-1:0] mode; \ logic [config_pkg::NRET*2-1:0] ixl; \ logic [config_pkg::NRET*5-1:0] rs1_addr; \ diff --git a/core/instr_realign.sv b/core/instr_realign.sv index 9191285b6e..3aae6ff922 100644 --- a/core/instr_realign.sv +++ b/core/instr_realign.sv @@ -127,7 +127,7 @@ module instr_realign instr_o[2] = '0; addr_o[2] = '0; instr_o[3] = {16'b0, data_i[63:48]}; - addr_o[3] = {address_i[riscv::VLEN-1:3], 3'b110}; + addr_o[3] = {address_i[CVA6Cfg.VLEN-1:3], 3'b110}; case (address_i[2:1]) 2'b00: begin @@ -153,11 +153,11 @@ module instr_realign addr_o[0] = unaligned_address_q; instr_o[1] = data_i[47:16]; - addr_o[1] = {address_i[riscv::VLEN-1:3], 3'b010}; 
+ addr_o[1] = {address_i[CVA6Cfg.VLEN-1:3], 3'b010}; if (instr_is_compressed[1]) begin instr_o[2] = data_i[63:32]; - addr_o[2] = {address_i[riscv::VLEN-1:3], 3'b100}; + addr_o[2] = {address_i[CVA6Cfg.VLEN-1:3], 3'b100}; valid_o[2] = valid_i; if (instr_is_compressed[2]) begin @@ -189,7 +189,7 @@ module instr_realign if (instr_is_compressed[0]) begin instr_o[1] = data_i[47:16]; - addr_o[1] = {address_i[riscv::VLEN-1:3], 3'b010}; + addr_o[1] = {address_i[CVA6Cfg.VLEN-1:3], 3'b010}; // 64 48 32 16 0 // | 3 | 2 | 1 | 0 | <- instruction slot @@ -200,7 +200,7 @@ module instr_realign // | * | C | C | C | C | -> aligned if (instr_is_compressed[1]) begin instr_o[2] = data_i[63:32]; - addr_o[2] = {address_i[riscv::VLEN-1:3], 3'b100}; + addr_o[2] = {address_i[CVA6Cfg.VLEN-1:3], 3'b100}; valid_o[2] = valid_i; if (instr_is_compressed[2]) begin @@ -231,7 +231,7 @@ module instr_realign // | * | C | C | I | // | * | I | I | instr_o[1] = data_i[63:32]; - addr_o[1] = {address_i[riscv::VLEN-1:3], 3'b100}; + addr_o[1] = {address_i[CVA6Cfg.VLEN-1:3], 3'b100}; instr_o[2] = instr_o[3]; addr_o[2] = addr_o[3]; @@ -262,15 +262,15 @@ module instr_realign // 000 110 100 010 <- unaligned address instr_o[0] = data_i[31:0]; - addr_o[0] = {address_i[riscv::VLEN-1:3], 3'b010}; + addr_o[0] = {address_i[CVA6Cfg.VLEN-1:3], 3'b010}; valid_o[0] = valid_i; instr_o[2] = data_i[63:32]; - addr_o[2] = {address_i[riscv::VLEN-1:3], 3'b110}; + addr_o[2] = {address_i[CVA6Cfg.VLEN-1:3], 3'b110}; if (instr_is_compressed[0]) begin instr_o[1] = data_i[47:16]; - addr_o[1] = {address_i[riscv::VLEN-1:3], 3'b100}; + addr_o[1] = {address_i[CVA6Cfg.VLEN-1:3], 3'b100}; valid_o[1] = valid_i; if (instr_is_compressed[1]) begin @@ -304,11 +304,11 @@ module instr_realign // 1000 110 100 <- unaligned address instr_o[0] = data_i[31:0]; - addr_o[0] = {address_i[riscv::VLEN-1:3], 3'b100}; + addr_o[0] = {address_i[CVA6Cfg.VLEN-1:3], 3'b100}; valid_o[0] = valid_i; instr_o[1] = data_i[47:16]; - addr_o[1] = {address_i[riscv::VLEN-1:3], 
3'b110}; + addr_o[1] = {address_i[CVA6Cfg.VLEN-1:3], 3'b110}; if (instr_is_compressed[0]) begin if (instr_is_compressed[1]) begin @@ -330,7 +330,7 @@ module instr_realign // 1000 110 <- unaligned address instr_o[0] = data_i[31:0]; - addr_o[0] = {address_i[riscv::VLEN-1:3], 3'b110}; + addr_o[0] = {address_i[CVA6Cfg.VLEN-1:3], 3'b110}; if (instr_is_compressed[0]) begin valid_o[0] = valid_i; diff --git a/core/issue_read_operands.sv b/core/issue_read_operands.sv index f119c7a2b4..56045cdc5a 100644 --- a/core/issue_read_operands.sv +++ b/core/issue_read_operands.sv @@ -21,101 +21,105 @@ module issue_read_operands parameter type branchpredict_sbe_t = logic, parameter type fu_data_t = logic, parameter type scoreboard_entry_t = logic, - parameter type rs3_len_t = logic + parameter type forwarding_t = logic, + parameter type writeback_t = logic, + parameter type rs3_len_t = logic, + parameter type x_issue_req_t = logic, + parameter type x_issue_resp_t = logic, + parameter type x_register_t = logic, + parameter type x_commit_t = logic ) ( // Subsystem Clock - SUBSYSTEM input logic clk_i, // Asynchronous reset active low - SUBSYSTEM input logic rst_ni, - // Flush - CONTROLLER + // Prevent from issuing - CONTROLLER input logic flush_i, // Stall inserted by Acc dispatcher - ACC_DISPATCHER input logic stall_i, - // TO_BE_COMPLETED - TO_BE_COMPLETED + // Entry about the instruction to issue - SCOREBOARD input scoreboard_entry_t [CVA6Cfg.NrIssuePorts-1:0] issue_instr_i, - // TO_BE_COMPLETED - TO_BE_COMPLETED + // Instruction to issue - SCOREBOARD input logic [CVA6Cfg.NrIssuePorts-1:0][31:0] orig_instr_i, - // TO_BE_COMPLETED - TO_BE_COMPLETED + // Is there an instruction to issue - SCOREBOARD input logic [CVA6Cfg.NrIssuePorts-1:0] issue_instr_valid_i, - // Issue stage acknowledge - TO_BE_COMPLETED + // Issue stage acknowledge - SCOREBOARD output logic [CVA6Cfg.NrIssuePorts-1:0] issue_ack_o, - // rs1 operand address - scoreboard - output logic 
[CVA6Cfg.NrIssuePorts-1:0][REG_ADDR_SIZE-1:0] rs1_o, - // rs1 operand - scoreboard - input logic [CVA6Cfg.NrIssuePorts-1:0][CVA6Cfg.XLEN-1:0] rs1_i, - // rs1 operand is valid - scoreboard - input logic [CVA6Cfg.NrIssuePorts-1:0] rs1_valid_i, - // rs2 operand address - scoreboard - output logic [CVA6Cfg.NrIssuePorts-1:0][REG_ADDR_SIZE-1:0] rs2_o, - // rs2 operand - scoreboard - input logic [CVA6Cfg.NrIssuePorts-1:0][CVA6Cfg.XLEN-1:0] rs2_i, - // rs2 operand is valid - scoreboard - input logic [CVA6Cfg.NrIssuePorts-1:0] rs2_valid_i, - // rs3 operand address - scoreboard - output logic [CVA6Cfg.NrIssuePorts-1:0][REG_ADDR_SIZE-1:0] rs3_o, - // rs3 operand - scoreboard - input rs3_len_t [CVA6Cfg.NrIssuePorts-1:0] rs3_i, - // rs3 operand is valid - scoreboard - input logic [CVA6Cfg.NrIssuePorts-1:0] rs3_valid_i, - // get clobber input - // TO_BE_COMPLETED - TO_BE_COMPLETED - input fu_t [2**REG_ADDR_SIZE-1:0] rd_clobber_gpr_i, - // TO_BE_COMPLETED - TO_BE_COMPLETED - input fu_t [2**REG_ADDR_SIZE-1:0] rd_clobber_fpr_i, - // TO_BE_COMPLETED - TO_BE_COMPLETED + // Forwarding - SCOREBOARD + input forwarding_t fwd_i, + // FU data useful to execute instruction - EX_STAGE output fu_data_t [CVA6Cfg.NrIssuePorts-1:0] fu_data_o, - // Unregistered version of fu_data_o.operanda - TO_BE_COMPLETED + // Unregistered version of fu_data_o.operanda - EX_STAGE output logic [CVA6Cfg.NrIssuePorts-1:0][CVA6Cfg.XLEN-1:0] rs1_forwarding_o, - // Unregistered version of fu_data_o.operandb - TO_BE_COMPLETED + // Unregistered version of fu_data_o.operandb - EX_STAGE output logic [CVA6Cfg.NrIssuePorts-1:0][CVA6Cfg.XLEN-1:0] rs2_forwarding_o, - // Instruction pc - TO_BE_COMPLETED + // Program Counter - EX_STAGE output logic [CVA6Cfg.VLEN-1:0] pc_o, - // Is compressed instruction - TO_BE_COMPLETED + // Is compressed instruction - EX_STAGE output logic is_compressed_instr_o, - // Fixed Latency Unit ready to accept new request - TO_BE_COMPLETED + // Fixed Latency Unit is ready - EX_STAGE input logic 
flu_ready_i, - // ALU output is valid - TO_BE_COMPLETED + // ALU output is valid - EX_STAGE output logic [CVA6Cfg.NrIssuePorts-1:0] alu_valid_o, - // Branch instruction is valid - TO_BE_COMPLETED + // Branch unit is valid - EX_STAGE output logic [CVA6Cfg.NrIssuePorts-1:0] branch_valid_o, - // Transformed instruction - TO_BE_COMPLETED + // Transformed trap instruction - EX_STAGE output logic [CVA6Cfg.NrIssuePorts-1:0][31:0] tinst_o, - // TO_BE_COMPLETED - TO_BE_COMPLETED + // Information of branch prediction - EX_STAGE output branchpredict_sbe_t branch_predict_o, - // Load Store Unit is ready - TO_BE_COMPLETED + // Load store unit FU is ready - EX_STAGE input logic lsu_ready_i, - // Load Store Unit result is valid - TO_BE_COMPLETED + // Load store unit FU is valid - EX_STAGE output logic [CVA6Cfg.NrIssuePorts-1:0] lsu_valid_o, - // Mult result is valid - TO_BE_COMPLETED + // Mult FU is valid - EX_STAGE output logic [CVA6Cfg.NrIssuePorts-1:0] mult_valid_o, - // FPU is ready - TO_BE_COMPLETED + // FPU FU is ready - EX_STAGE input logic fpu_ready_i, - // FPU result is valid - TO_BE_COMPLETED + // FPU FU is valid - EX_STAGE output logic [CVA6Cfg.NrIssuePorts-1:0] fpu_valid_o, - // FPU fmt field from instruction - TO_BE_COMPLETED + // FPU fmt field - EX_STAGE output logic [1:0] fpu_fmt_o, - // FPU rm field from isntruction - TO_BE_COMPLETED + // FPU rm field - EX_STAGE output logic [2:0] fpu_rm_o, - // ALU output is valid - TO_BE_COMPLETED + // ALU2 FU is valid - EX_STAGE output logic [CVA6Cfg.NrIssuePorts-1:0] alu2_valid_o, - // CSR result is valid - TO_BE_COMPLETED + // CSR is valid - EX_STAGE output logic [CVA6Cfg.NrIssuePorts-1:0] csr_valid_o, - // CVXIF result is valid - TO_BE_COMPLETED + // CVXIF FU is valid - EX_STAGE output logic [CVA6Cfg.NrIssuePorts-1:0] cvxif_valid_o, - // CVXIF is ready - TO_BE_COMPLETED + // CVXIF is FU ready - EX_STAGE input logic cvxif_ready_i, - // CVXIF offloaded instruction - TO_BE_COMPLETED + // CVXIF offloader instruction value - 
EX_STAGE output logic [31:0] cvxif_off_instr_o, - // TO_BE_COMPLETED - TO_BE_COMPLETED + // CVA6 Hart ID - SUBSYSTEM + input logic [CVA6Cfg.XLEN-1:0] hart_id_i, + // CVXIF Issue interface + input logic x_issue_ready_i, + input x_issue_resp_t x_issue_resp_i, + output logic x_issue_valid_o, + output x_issue_req_t x_issue_req_o, + // CVXIF Register interface + input logic x_register_ready_i, + output logic x_register_valid_o, + output x_register_t x_register_o, + // CVXIF Commit interface + output logic x_commit_valid_o, + output x_commit_t x_commit_o, + // Writeback Handling of CVXIF + output logic x_transaction_accepted_o, + output logic x_transaction_rejected_o, + output logic x_issue_writeback_o, + output logic [CVA6Cfg.TRANS_ID_BITS-1:0] x_id_o, + // Destination register in the register file - COMMIT_STAGE input logic [CVA6Cfg.NrCommitPorts-1:0][4:0] waddr_i, - // TO_BE_COMPLETED - TO_BE_COMPLETED + // Value to write to register file - COMMIT_STAGE input logic [CVA6Cfg.NrCommitPorts-1:0][CVA6Cfg.XLEN-1:0] wdata_i, - // TO_BE_COMPLETED - TO_BE_COMPLETED + // GPR write enable - COMMIT_STAGE input logic [CVA6Cfg.NrCommitPorts-1:0] we_gpr_i, - // TO_BE_COMPLETED - TO_BE_COMPLETED + // FPR write enable - COMMIT_STAGE input logic [CVA6Cfg.NrCommitPorts-1:0] we_fpr_i, - // Stall signal, we do not want to fetch any more entries - TO_BE_COMPLETED + // Issue stall - PERF_COUNTERS output logic stall_issue_o ); @@ -125,9 +129,10 @@ module issue_read_operands logic none, load, store, alu, alu2, ctrl_flow, mult, csr, fpu, fpu_vec, cvxif, accel; } fus_busy_t; - logic [CVA6Cfg.NrIssuePorts-1:0] stall; + logic [CVA6Cfg.NrIssuePorts-1:0] stall_raw, stall_waw, stall_rs1, stall_rs2, stall_rs3; logic [CVA6Cfg.NrIssuePorts-1:0] fu_busy; // functional unit is busy fus_busy_t [CVA6Cfg.NrIssuePorts-1:0] fus_busy; // which functional units are considered busy + logic [CVA6Cfg.NrIssuePorts-1:0] issue_ack; // operands coming from regfile logic [CVA6Cfg.NrIssuePorts-1:0][CVA6Cfg.XLEN-1:0] 
operand_a_regfile, operand_b_regfile; // third operand from fp regfile or gp regfile if NR_RGPR_PORTS == 3 @@ -135,19 +140,50 @@ module issue_read_operands rs3_len_t operand_c_fpr; // output flipflop (ID <-> EX) fu_data_t [CVA6Cfg.NrIssuePorts-1:0] fu_data_n, fu_data_q; - logic [ CVA6Cfg.XLEN-1:0] imm_forward_rs3; - - logic [CVA6Cfg.NrIssuePorts-1:0] alu_valid_q; - logic [CVA6Cfg.NrIssuePorts-1:0] mult_valid_q; - logic [CVA6Cfg.NrIssuePorts-1:0] fpu_valid_q; - logic [ 1:0] fpu_fmt_q; - logic [ 2:0] fpu_rm_q; - logic [CVA6Cfg.NrIssuePorts-1:0] alu2_valid_q; - logic [CVA6Cfg.NrIssuePorts-1:0] lsu_valid_q; - logic [CVA6Cfg.NrIssuePorts-1:0] csr_valid_q; - logic [CVA6Cfg.NrIssuePorts-1:0] branch_valid_q; - logic [CVA6Cfg.NrIssuePorts-1:0] cvxif_valid_q; - logic [ 31:0] cvxif_off_instr_q; + logic [ CVA6Cfg.XLEN-1:0] imm_forward_rs3; + + logic [CVA6Cfg.NrIssuePorts-1:0] alu_valid_q; + logic [CVA6Cfg.NrIssuePorts-1:0] mult_valid_q; + logic [CVA6Cfg.NrIssuePorts-1:0] fpu_valid_q; + logic [ 1:0] fpu_fmt_q; + logic [ 2:0] fpu_rm_q; + logic [CVA6Cfg.NrIssuePorts-1:0] alu2_valid_q; + logic [CVA6Cfg.NrIssuePorts-1:0] lsu_valid_q; + logic [CVA6Cfg.NrIssuePorts-1:0] csr_valid_q; + logic [CVA6Cfg.NrIssuePorts-1:0] branch_valid_q; + logic [CVA6Cfg.NrIssuePorts-1:0] cvxif_valid_q; + logic [ 31:0] cvxif_off_instr_q; + logic cvxif_instruction_valid; + + //fwd logic + logic [CVA6Cfg.NrIssuePorts-1:0] rs1_has_raw; + logic [CVA6Cfg.NrIssuePorts-1:0] rs2_has_raw; + logic [CVA6Cfg.NrIssuePorts-1:0] rs3_has_raw; + + logic [CVA6Cfg.NrIssuePorts-1:0][CVA6Cfg.XLEN-1:0] rs3; + logic [CVA6Cfg.NrIssuePorts-1:0] rs3_fpr; + + logic [CVA6Cfg.NrIssuePorts-1:0] rs1_valid; + logic [CVA6Cfg.NrIssuePorts-1:0] rs2_valid; + logic [CVA6Cfg.NrIssuePorts-1:0] rs3_valid; + + logic [CVA6Cfg.NrIssuePorts-1:0][CVA6Cfg.XLEN-1:0] rs1_res; + logic [CVA6Cfg.NrIssuePorts-1:0][CVA6Cfg.XLEN-1:0] rs2_res; + logic [CVA6Cfg.NrIssuePorts-1:0][CVA6Cfg.XLEN-1:0] rs3_res; + + // clobber + fu_t 
[2**ariane_pkg::REG_ADDR_SIZE-1:0] rd_clobber_gpr, rd_clobber_fpr; + logic [2**ariane_pkg::REG_ADDR_SIZE-1:0][CVA6Cfg.NR_SB_ENTRIES:0] gpr_clobber_vld; + logic [2**ariane_pkg::REG_ADDR_SIZE-1:0][CVA6Cfg.NR_SB_ENTRIES:0] fpr_clobber_vld; + ariane_pkg::fu_t [ CVA6Cfg.NR_SB_ENTRIES:0] clobber_fu; + + //forward logic + logic [CVA6Cfg.NrIssuePorts-1:0][CVA6Cfg.NR_SB_ENTRIES+CVA6Cfg.NrWbPorts-1:0] + rs1_fwd_req, rs2_fwd_req, rs3_fwd_req; + logic [CVA6Cfg.NrIssuePorts-1:0] rs1_is_not_gpr0, rs2_is_not_gpr0, rs3_is_not_gpr0; + logic [CVA6Cfg.NrIssuePorts-1:0][CVA6Cfg.NR_SB_ENTRIES+CVA6Cfg.NrWbPorts-1:0][CVA6Cfg.XLEN-1:0] rs_data; + logic [CVA6Cfg.NrIssuePorts-1:0] rs1_available, rs2_available, rs3_available; + logic [CVA6Cfg.NrIssuePorts-1:0][31:0] tinst_n, tinst_q; // transformed instruction @@ -158,6 +194,56 @@ module issue_read_operands riscv::instruction_t orig_instr; assign orig_instr = riscv::instruction_t'(orig_instr_i[0]); + // CVXIF Signals + logic cvxif_req_allowed; + logic x_transaction_rejected; + logic [OPERANDS_PER_INSTR-1:0] rs_valid; + logic [OPERANDS_PER_INSTR-1:0][CVA6Cfg.XLEN-1:0] rs; + + cvxif_issue_register_commit_if_driver #( + .CVA6Cfg (CVA6Cfg), + .x_issue_req_t (x_issue_req_t), + .x_issue_resp_t(x_issue_resp_t), + .x_register_t (x_register_t), + .x_commit_t (x_commit_t) + ) i_cvxif_issue_register_commit_if_driver ( + .clk_i (clk_i), + .rst_ni (rst_ni), + .flush_i (flush_i), + .hart_id_i (hart_id_i), + .issue_ready_i (x_issue_ready_i), + .issue_resp_i (x_issue_resp_i), + .issue_valid_o (x_issue_valid_o), + .issue_req_o (x_issue_req_o), + .register_ready_i(x_register_ready_i), + .register_valid_o(x_register_valid_o), + .register_o (x_register_o), + .commit_valid_o (x_commit_valid_o), + .commit_o (x_commit_o), + .valid_i (cvxif_instruction_valid), + .x_off_instr_i (orig_instr_i[0]), + .x_trans_id_i (issue_instr_i[0].trans_id), + .register_i (rs), + .rs_valid_i (rs_valid), + .cvxif_busy_o () + ); + if (OPERANDS_PER_INSTR == 3) begin + assign rs_valid = 
{~stall_rs3[0], ~stall_rs2[0], ~stall_rs1[0]}; + assign rs = {fu_data_n[0].imm, fu_data_n[0].operand_b, fu_data_n[0].operand_a}; + end else begin + assign rs_valid = {~stall_rs2[0], ~stall_rs1[0]}; + assign rs = {fu_data_n[0].operand_b, fu_data_n[0].operand_a}; + end + + // TODO check only for 1st instruction ?? + // Allow a cvxif transaction if the WaW conditions are ok. + assign cvxif_req_allowed = (issue_instr_i[0].fu == CVXIF) && !stall_waw[0]; + assign cvxif_instruction_valid = !issue_instr_i[0].ex.valid && issue_instr_valid_i[0] && cvxif_req_allowed; + assign x_transaction_accepted_o = x_issue_valid_o && x_issue_ready_i && x_issue_resp_i.accept; + assign x_transaction_rejected = x_issue_valid_o && x_issue_ready_i && ~x_issue_resp_i.accept; + assign x_issue_writeback_o = x_issue_resp_i.writeback; + assign x_id_o = x_issue_req_o.id; + // ID <-> EX registers for (genvar i = 0; i < CVA6Cfg.NrIssuePorts; i++) begin @@ -177,7 +263,7 @@ module issue_read_operands assign alu2_valid_o = alu2_valid_q; assign cvxif_valid_o = CVA6Cfg.CvxifEn ? cvxif_valid_q : '0; assign cvxif_off_instr_o = CVA6Cfg.CvxifEn ? cvxif_off_instr_q : '0; - assign stall_issue_o = stall[0]; + assign stall_issue_o = stall_raw[0]; assign tinst_o = CVA6Cfg.RVH ? tinst_q : '0; // --------------- // Issue Stage @@ -185,7 +271,9 @@ module issue_read_operands always_comb begin : structural_hazards fus_busy = '0; - + // CVXIF is always ready to try a new transaction on 1st issue port + // If a transaction is already pending then we stall until the transaction is done (issue_ack_o[0] = 0). + // Since we can not have two CVXIF instructions on 1st issue port, CVXIF is always ready for the pending instruction.
if (!flu_ready_i) begin fus_busy[0].alu = 1'b1; fus_busy[0].ctrl_flow = 1'b1; @@ -212,15 +300,13 @@ module issue_read_operands fus_busy[0].store = 1'b1; end - if (!cvxif_ready_i) begin - fus_busy[0].cvxif = 1'b1; - end - if (CVA6Cfg.SuperscalarEn) begin fus_busy[1] = fus_busy[0]; // Never issue CSR instruction on second issue port. fus_busy[1].csr = 1'b1; + // Never issue CVXIF instruction on second issue port. + fus_busy[1].cvxif = 1'b1; unique case (issue_instr_i[0].fu) NONE: fus_busy[1].none = 1'b1; @@ -272,7 +358,7 @@ module issue_read_operands fus_busy[1].load = 1'b1; fus_busy[1].store = 1'b1; end - CVXIF: fus_busy[1].cvxif = 1'b1; + CVXIF: ; endcase end end @@ -293,92 +379,315 @@ module issue_read_operands CTRL_FLOW: fu_busy[i] = fus_busy[i].ctrl_flow; CSR: fu_busy[i] = fus_busy[i].csr; MULT: fu_busy[i] = fus_busy[i].mult; - FPU: fu_busy[i] = fus_busy[i].fpu; - FPU_VEC: fu_busy[i] = fus_busy[i].fpu_vec; LOAD: fu_busy[i] = fus_busy[i].load; STORE: fu_busy[i] = fus_busy[i].store; CVXIF: fu_busy[i] = fus_busy[i].cvxif; - default: fu_busy[i] = 1'b0; + default: + if (CVA6Cfg.FpPresent) begin + unique case (issue_instr_i[i].fu) + FPU: fu_busy[i] = fus_busy[i].fpu; + FPU_VEC: fu_busy[i] = fus_busy[i].fpu_vec; + default: fu_busy[i] = 1'b0; + endcase + end else begin + fu_busy[i] = 1'b0; + end endcase end end + // ------------------- + // RD clobber process + // ------------------- + // rd_clobber output: output currently clobbered destination registers + + always_comb begin : clobber_assign + gpr_clobber_vld = '0; + fpr_clobber_vld = '0; + + // default (highest entry has lowest prio in arbiter tree below) + clobber_fu[CVA6Cfg.NR_SB_ENTRIES] = ariane_pkg::NONE; + for (int unsigned i = 0; i < 2 ** ariane_pkg::REG_ADDR_SIZE; i++) begin + gpr_clobber_vld[i][CVA6Cfg.NR_SB_ENTRIES] = 1'b1; + fpr_clobber_vld[i][CVA6Cfg.NR_SB_ENTRIES] = 1'b1; + end + + // check for all valid entries and set the clobber accordingly + + for (int unsigned i = 0; i < CVA6Cfg.NR_SB_ENTRIES; i++)
begin + gpr_clobber_vld[fwd_i.sbe[i].rd][i] = fwd_i.still_issued[i] & ~(CVA6Cfg.FpPresent && ariane_pkg::is_rd_fpr( + fwd_i.sbe[i].op)); + fpr_clobber_vld[fwd_i.sbe[i].rd][i] = fwd_i.still_issued[i] & (CVA6Cfg.FpPresent && ariane_pkg::is_rd_fpr( + fwd_i.sbe[i].op)); + clobber_fu[i] = fwd_i.sbe[i].fu; + end + + // GPR[0] is always free + gpr_clobber_vld[0] = '0; + end + + for (genvar k = 0; k < 2 ** ariane_pkg::REG_ADDR_SIZE; k++) begin : gen_sel_clobbers + // get fu that is going to clobber this register (there should be only one) + rr_arb_tree #( + .NumIn(CVA6Cfg.NR_SB_ENTRIES + 1), + .DataType(ariane_pkg::fu_t), + .ExtPrio(1'b1), + .AxiVldRdy(1'b1) + ) i_sel_gpr_clobbers ( + .clk_i (clk_i), + .rst_ni (rst_ni), + .flush_i(1'b0), + .rr_i ('0), + .req_i (gpr_clobber_vld[k]), + .gnt_o (), + .data_i (clobber_fu), + .gnt_i (1'b1), + .req_o (), + .data_o (rd_clobber_gpr[k]), + .idx_o () + ); + if (CVA6Cfg.FpPresent) begin + rr_arb_tree #( + .NumIn(CVA6Cfg.NR_SB_ENTRIES + 1), + .DataType(ariane_pkg::fu_t), + .ExtPrio(1'b1), + .AxiVldRdy(1'b1) + ) i_sel_fpr_clobbers ( + .clk_i (clk_i), + .rst_ni (rst_ni), + .flush_i(1'b0), + .rr_i ('0), + .req_i (fpr_clobber_vld[k]), + .gnt_o (), + .data_i (clobber_fu), + .gnt_i (1'b1), + .req_o (), + .data_o (rd_clobber_fpr[k]), + .idx_o () + ); + end else begin + assign rd_clobber_fpr[k] = NONE; + end + end + + // ---------------------------------- + // Read Operands (a.k.a forwarding) + // ---------------------------------- + // read operand interface: same logic as register file + + // WB ports have higher prio than entries + for (genvar i = 0; i < CVA6Cfg.NrIssuePorts; i++) begin + for (genvar k = 0; unsigned'(k) < CVA6Cfg.NrWbPorts; k++) begin : gen_rs_wb + + assign rs1_fwd_req[i][k] = (fwd_i.sbe[fwd_i.wb[k].trans_id].rd == issue_instr_i[i].rs1) & (fwd_i.still_issued[fwd_i.wb[k].trans_id]) & fwd_i.wb[k].valid & (~fwd_i.wb[k].ex_valid) & ((CVA6Cfg.FpPresent && ariane_pkg::is_rd_fpr( + fwd_i.sbe[fwd_i.wb[k].trans_id].op + )) == 
(CVA6Cfg.FpPresent && ariane_pkg::is_rs1_fpr( + issue_instr_i[i].op + ))); + + assign rs2_fwd_req[i][k] = (fwd_i.sbe[fwd_i.wb[k].trans_id].rd == issue_instr_i[i].rs2) & (fwd_i.still_issued[fwd_i.wb[k].trans_id]) & fwd_i.wb[k].valid & (~fwd_i.wb[k].ex_valid) & ((CVA6Cfg.FpPresent && ariane_pkg::is_rd_fpr( + fwd_i.sbe[fwd_i.wb[k].trans_id].op + )) == (CVA6Cfg.FpPresent && ariane_pkg::is_rs2_fpr( + issue_instr_i[i].op + ))); + + assign rs3_fwd_req[i][k] = (fwd_i.sbe[fwd_i.wb[k].trans_id].rd == issue_instr_i[i].result[ariane_pkg::REG_ADDR_SIZE-1:0]) & (fwd_i.still_issued[fwd_i.wb[k].trans_id]) & fwd_i.wb[k].valid & (~fwd_i.wb[k].ex_valid) & ((CVA6Cfg.FpPresent && ariane_pkg::is_rd_fpr( + fwd_i.sbe[fwd_i.wb[k].trans_id].op + )) == (CVA6Cfg.FpPresent && ariane_pkg::is_imm_fpr( + issue_instr_i[i].op + ))); + + assign rs_data[i][k] = fwd_i.wb[k].data; + end + + for (genvar k = 0; unsigned'(k) < CVA6Cfg.NR_SB_ENTRIES; k++) begin : gen_rs_entries + + assign rs1_fwd_req[i][k+CVA6Cfg.NrWbPorts] = (fwd_i.sbe[k].rd == issue_instr_i[i].rs1) & fwd_i.still_issued[k] & fwd_i.sbe[k].valid & ((CVA6Cfg.FpPresent && ariane_pkg::is_rd_fpr( + fwd_i.sbe[k].op + )) == (CVA6Cfg.FpPresent && ariane_pkg::is_rs1_fpr( + issue_instr_i[i].op + ))); + + assign rs2_fwd_req[i][k+CVA6Cfg.NrWbPorts] = (fwd_i.sbe[k].rd == issue_instr_i[i].rs2) & fwd_i.still_issued[k] & fwd_i.sbe[k].valid & ((CVA6Cfg.FpPresent && ariane_pkg::is_rd_fpr( + fwd_i.sbe[k].op + )) == (CVA6Cfg.FpPresent && ariane_pkg::is_rs2_fpr( + issue_instr_i[i].op + ))); + + assign rs3_fwd_req[i][k+CVA6Cfg.NrWbPorts] = (fwd_i.sbe[k].rd == issue_instr_i[i].result[ariane_pkg::REG_ADDR_SIZE-1:0]) & fwd_i.still_issued[k] & fwd_i.sbe[k].valid & ((CVA6Cfg.FpPresent && ariane_pkg::is_rd_fpr( + fwd_i.sbe[k].op + )) == (CVA6Cfg.FpPresent && ariane_pkg::is_imm_fpr( + issue_instr_i[i].op + ))); + + assign rs_data[i][k+CVA6Cfg.NrWbPorts] = fwd_i.sbe[k].result; + end + + // use fixed prio here + // this implicitly gives higher prio to WB ports + 
rr_arb_tree #( + .NumIn(CVA6Cfg.NR_SB_ENTRIES + CVA6Cfg.NrWbPorts), + .DataWidth(CVA6Cfg.XLEN), + .ExtPrio(1'b1), + .AxiVldRdy(1'b1) + ) i_sel_rs1 ( + .clk_i (clk_i), + .rst_ni (rst_ni), + .flush_i(1'b0), + .rr_i ('0), + .req_i (rs1_fwd_req[i]), + .gnt_o (), + .data_i (rs_data[i]), + .gnt_i (1'b1), + .req_o (rs1_available[i]), + .data_o (rs1_res[i]), + .idx_o () + ); + + rr_arb_tree #( + .NumIn(CVA6Cfg.NR_SB_ENTRIES + CVA6Cfg.NrWbPorts), + .DataWidth(CVA6Cfg.XLEN), + .ExtPrio(1'b1), + .AxiVldRdy(1'b1) + ) i_sel_rs2 ( + .clk_i (clk_i), + .rst_ni (rst_ni), + .flush_i(1'b0), + .rr_i ('0), + .req_i (rs2_fwd_req[i]), + .gnt_o (), + .data_i (rs_data[i]), + .gnt_i (1'b1), + .req_o (rs2_available[i]), + .data_o (rs2_res[i]), + .idx_o () + ); + + + rr_arb_tree #( + .NumIn(CVA6Cfg.NR_SB_ENTRIES + CVA6Cfg.NrWbPorts), + .DataWidth(CVA6Cfg.XLEN), + .ExtPrio(1'b1), + .AxiVldRdy(1'b1) + ) i_sel_rs3 ( + .clk_i (clk_i), + .rst_ni (rst_ni), + .flush_i(1'b0), + .rr_i ('0), + .req_i (rs3_fwd_req[i]), + .gnt_o (), + .data_i (rs_data[i]), + .gnt_i (1'b1), + .req_o (rs3_available[i]), + .data_o (rs3[i]), + .idx_o () + ); + + if (CVA6Cfg.NrRgprPorts == 3) begin : gen_gp_three_port + assign rs3_res[i] = rs3[i][riscv::XLEN-1:0]; + end else begin : gen_fp_three_port + assign rs3_res[i] = rs3[i][CVA6Cfg.FLen-1:0]; + end + + assign rs1_has_raw[i] = !issue_instr_i[i].use_zimm && ((CVA6Cfg.FpPresent && is_rs1_fpr( + issue_instr_i[i].op + )) ? rd_clobber_fpr[issue_instr_i[i].rs1] != NONE : + rd_clobber_gpr[issue_instr_i[i].rs1] != NONE); + + assign rs1_valid[i] = rs1_available[i] && (CVA6Cfg.FpPresent && is_rs1_fpr( + issue_instr_i[i].op + ) ? 1'b1 : ((rd_clobber_gpr[issue_instr_i[i].rs1] != CSR) || + (CVA6Cfg.RVS && issue_instr_i[i].op == SFENCE_VMA))); + + assign rs2_has_raw[i] = ((CVA6Cfg.FpPresent && is_rs2_fpr( + issue_instr_i[i].op + )) ? 
rd_clobber_fpr[issue_instr_i[i].rs2] != NONE : + rd_clobber_gpr[issue_instr_i[i].rs2] != NONE); + + assign rs2_valid[i] = rs2_available[i] && (CVA6Cfg.FpPresent && is_rs2_fpr( + issue_instr_i[i].op + ) ? 1'b1 : ((rd_clobber_gpr[issue_instr_i[i].rs2] != CSR) || + (CVA6Cfg.RVS && issue_instr_i[i].op == SFENCE_VMA))); + + assign rs3_has_raw[i] = ((CVA6Cfg.FpPresent && is_imm_fpr( + issue_instr_i[i].op + )) ? rd_clobber_fpr[issue_instr_i[i].result[REG_ADDR_SIZE-1:0]] != NONE : 0); + + assign rs3_valid[i] = rs3_available[i]; + assign rs3_fpr[i] = (CVA6Cfg.FpPresent && ariane_pkg::is_imm_fpr(issue_instr_i[i].op)); + + end + // --------------- // Register stage // --------------- // check that all operands are available, otherwise stall // forward corresponding register always_comb begin : operands_available - stall = '{default: stall_i}; + stall_raw = '{default: stall_i}; + stall_rs1 = '{default: stall_i}; + stall_rs2 = '{default: stall_i}; + stall_rs3 = '{default: stall_i}; // operand forwarding signals forward_rs1 = '0; forward_rs2 = '0; forward_rs3 = '0; // FPR only for (int unsigned i = 0; i < CVA6Cfg.NrIssuePorts; i++) begin - // poll the scoreboard for those values - rs1_o[i] = issue_instr_i[i].rs1; - rs2_o[i] = issue_instr_i[i].rs2; - rs3_o[i] = issue_instr_i[i].result[REG_ADDR_SIZE-1:0]; // rs3 is encoded in imm field - - // 0. check that we are not using the zimm type in RS1 - // as this is an immediate we do not have to wait on anything here - // 1. check if the source registers are clobbered --> check appropriate clobber list (gpr/fpr) - // 2. poll the scoreboard - if (!issue_instr_i[i].use_zimm && ((CVA6Cfg.FpPresent && is_rs1_fpr( - issue_instr_i[i].op - )) ? 
rd_clobber_fpr_i[issue_instr_i[i].rs1] != NONE : - rd_clobber_gpr_i[issue_instr_i[i].rs1] != NONE)) begin - // check if the clobbering instruction is not a CSR instruction, CSR instructions can only - // be fetched through the register file since they can't be forwarded - // if the operand is available, forward it. CSRs don't write to/from FPR - if (rs1_valid_i[i] && (CVA6Cfg.FpPresent && is_rs1_fpr( - issue_instr_i[i].op - ) ? 1'b1 : ((rd_clobber_gpr_i[issue_instr_i[i].rs1] != CSR) || - (CVA6Cfg.RVS && issue_instr_i[i].op == SFENCE_VMA)))) begin + if (rs1_has_raw[i]) begin + if (rs1_valid[i]) begin forward_rs1[i] = 1'b1; end else begin // the operand is not available -> stall - stall[i] = 1'b1; + stall_raw[i] = 1'b1; + stall_rs1[i] = 1'b1; end end - if ((CVA6Cfg.FpPresent && is_rs2_fpr( - issue_instr_i[i].op - )) ? rd_clobber_fpr_i[issue_instr_i[i].rs2] != NONE : - rd_clobber_gpr_i[issue_instr_i[i].rs2] != NONE) begin - // if the operand is available, forward it. CSRs don't write to/from FPR - if (rs2_valid_i[i] && (CVA6Cfg.FpPresent && is_rs2_fpr( - issue_instr_i[i].op - ) ? 1'b1 : ((rd_clobber_gpr_i[issue_instr_i[i].rs2] != CSR) || - (CVA6Cfg.RVS && issue_instr_i[i].op == SFENCE_VMA)))) begin + if (rs2_has_raw[i]) begin + if (rs2_valid[i]) begin forward_rs2[i] = 1'b1; end else begin // the operand is not available -> stall - stall[i] = 1'b1; + stall_raw[i] = 1'b1; + stall_rs2[i] = 1'b1; end end - // Only check clobbered gpr for OFFLOADED instruction - if ((CVA6Cfg.FpPresent && is_imm_fpr( - issue_instr_i[i].op - )) ? rd_clobber_fpr_i[issue_instr_i[i].result[REG_ADDR_SIZE-1:0]] != NONE : - issue_instr_i[i].op == OFFLOAD && CVA6Cfg.NrRgprPorts == 3 ? - rd_clobber_gpr_i[issue_instr_i[i].result[REG_ADDR_SIZE-1:0]] != NONE : 0) begin - // if the operand is available, forward it. 
CSRs don't write to/from FPR so no need to check - if (rs3_valid_i[i]) begin + if (rs3_has_raw[i] && rs3_fpr[i]) begin + if (rs3_valid[i]) begin forward_rs3[i] = 1'b1; end else begin // the operand is not available -> stall - stall[i] = 1'b1; + stall_raw[i] = 1'b1; + stall_rs3[i] = 1'b1; end end end + if (CVA6Cfg.CvxifEn) begin + // Remove unnecessary forward and stall in case source register is not needed by coprocessor. + if (x_issue_valid_o && x_issue_resp_i.accept) begin + if (~x_issue_resp_i.register_read[0]) begin + forward_rs1[0] = 1'b0; + stall_rs1[0] = 1'b0; + end + if (~x_issue_resp_i.register_read[1]) begin + forward_rs2[0] = 1'b0; + stall_rs2[0] = 1'b0; + end + if (OPERANDS_PER_INSTR == 3 && ~x_issue_resp_i.register_read[2]) begin + forward_rs3[0] = 1'b0; + stall_rs3[0] = 1'b0; + end + end + stall_raw[0] = stall_rs1[0] || stall_rs2[0] || stall_rs3[0]; + end + if (CVA6Cfg.SuperscalarEn) begin if (!issue_instr_i[1].use_zimm && (!CVA6Cfg.FpPresent || (is_rs1_fpr( issue_instr_i[1].op ) == is_rd_fpr( issue_instr_i[0].op ))) && issue_instr_i[1].rs1 == issue_instr_i[0].rd && issue_instr_i[1].rs1 != '0) begin - stall[1] = 1'b1; + stall_raw[1] = 1'b1; end if ((!CVA6Cfg.FpPresent || (is_rs2_fpr( @@ -386,7 +695,7 @@ module issue_read_operands ) == is_rd_fpr( issue_instr_i[0].op ))) && issue_instr_i[1].rs2 == issue_instr_i[0].rd && issue_instr_i[1].rs2 != '0) begin - stall[1] = 1'b1; + stall_raw[1] = 1'b1; end // Only check clobbered gpr for OFFLOADED instruction @@ -395,18 +704,18 @@ module issue_read_operands )) ? is_rd_fpr( issue_instr_i[0].op ) && issue_instr_i[0].rd == issue_instr_i[1].result[REG_ADDR_SIZE-1:0] : - issue_instr_i[1].op == OFFLOAD && CVA6Cfg.NrRgprPorts == 3 ? + issue_instr_i[1].op == OFFLOAD && OPERANDS_PER_INSTR == 3 ? 
issue_instr_i[0].rd == issue_instr_i[1].result[REG_ADDR_SIZE-1:0] : 1'b0) begin - stall[1] = 1'b1; + stall_raw[1] = 1'b1; end end end // third operand from fp regfile or gp regfile if NR_RGPR_PORTS == 3 - if (CVA6Cfg.NrRgprPorts == 3) begin : gen_gp_rs3 - assign imm_forward_rs3 = rs3_i[0]; + if (OPERANDS_PER_INSTR == 3) begin : gen_gp_rs3 + assign imm_forward_rs3 = rs3_res[0]; end else begin : gen_fp_rs3 - assign imm_forward_rs3 = {{CVA6Cfg.XLEN - CVA6Cfg.FLen{1'b0}}, rs3_i[0]}; + assign imm_forward_rs3 = {{CVA6Cfg.XLEN - CVA6Cfg.FLen{1'b0}}, rs3_res[0]}; end // Forwarding/Output MUX @@ -418,7 +727,7 @@ module issue_read_operands // immediates are the third operands in the store case // for FP operations, the imm field can also be the third operand from the regfile - if (CVA6Cfg.NrRgprPorts == 3) begin + if (OPERANDS_PER_INSTR == 3) begin fu_data_n[i].imm = (CVA6Cfg.FpPresent && is_imm_fpr(issue_instr_i[i].op)) ? {{CVA6Cfg.XLEN - CVA6Cfg.FLen{1'b0}}, operand_c_regfile[i]} : issue_instr_i[i].op == OFFLOAD ? 
operand_c_regfile[i] : issue_instr_i[i].result; @@ -435,12 +744,12 @@ module issue_read_operands // or should we forward if (forward_rs1[i]) begin - fu_data_n[i].operand_a = rs1_i[i]; + fu_data_n[i].operand_a = rs1_res[i]; end if (forward_rs2[i]) begin - fu_data_n[i].operand_b = rs2_i[i]; + fu_data_n[i].operand_b = rs2_res[i]; end - if (CVA6Cfg.FpPresent && forward_rs3[i]) begin + if ((CVA6Cfg.FpPresent || (CVA6Cfg.CvxifEn && OPERANDS_PER_INSTR == 3)) && forward_rs3[i]) begin fu_data_n[i].imm = imm_forward_rs3; end @@ -555,7 +864,7 @@ module issue_read_operands case (issue_instr_i[i].fu) CVXIF: begin cvxif_valid_q[i] <= 1'b1; - cvxif_off_instr_q <= orig_instr; + cvxif_off_instr_q <= orig_instr[i]; end default: ; endcase @@ -569,64 +878,69 @@ module issue_read_operands end end + always_comb begin : gen_check_waw_dependencies + stall_waw = '1; + for (int unsigned i = 0; i < CVA6Cfg.NrIssuePorts; i++) begin + if (issue_instr_valid_i[i] && !fu_busy[i]) begin + // ----------------------------------------- + // WAW - Write After Write Dependency Check + // ----------------------------------------- + // no other instruction has the same destination register -> issue the instruction + if ((CVA6Cfg.FpPresent && ariane_pkg::is_rd_fpr( + issue_instr_i[i].op + )) ? (rd_clobber_fpr[issue_instr_i[i].rd] == NONE) : + (rd_clobber_gpr[issue_instr_i[i].rd] == NONE)) begin + stall_waw[i] = 1'b0; + end + // or check that the target destination register will be written in this cycle by the + // commit stage + for (int unsigned c = 0; c < CVA6Cfg.NrCommitPorts; c++) begin + if ((CVA6Cfg.FpPresent && ariane_pkg::is_rd_fpr( + issue_instr_i[i].op + )) ? 
(we_fpr_i[c] && waddr_i[c] == issue_instr_i[i].rd) : + (we_gpr_i[c] && waddr_i[c] == issue_instr_i[i].rd)) begin + stall_waw[i] = 1'b0; + end + end + if (i > 0) begin + if ((issue_instr_i[i].rd == issue_instr_i[i-1].rd) && (issue_instr_i[i].rd != '0)) begin + stall_waw[i] = 1'b1; + end + end + end + end + end + + // We can issue an instruction if we do not detect that any other instruction is writing the same // destination register. // We also need to check if there is an unresolved branch in the scoreboard. always_comb begin : issue_scoreboard for (int unsigned i = 0; i < CVA6Cfg.NrIssuePorts; i++) begin // default assignment - issue_ack_o[i] = 1'b0; - // check that we didn't stall, that the instruction we got is valid + issue_ack[i] = 1'b0; + // check that the instruction we got is valid // and that the functional unit we need is not busy if (issue_instr_valid_i[i] && !fu_busy[i]) begin - // check that the corresponding functional unit is not busy - if (!stall[i]) begin - // ----------------------------------------- - // WAW - Write After Write Dependency Check - // ----------------------------------------- - // no other instruction has the same destination register -> issue the instruction - if ((CVA6Cfg.FpPresent && ariane_pkg::is_rd_fpr( - issue_instr_i[i].op - )) ? (rd_clobber_fpr_i[issue_instr_i[i].rd] == NONE) : - (rd_clobber_gpr_i[issue_instr_i[i].rd] == NONE)) begin - issue_ack_o[i] = 1'b1; - end - // or check that the target destination register will be written in this cycle by the - // commit stage - for (int unsigned c = 0; c < CVA6Cfg.NrCommitPorts; c++) begin - if ((CVA6Cfg.FpPresent && ariane_pkg::is_rd_fpr( - issue_instr_i[i].op - )) ? 
(we_fpr_i[c] && waddr_i[c] == issue_instr_i[i].rd[4:0]) : - (we_gpr_i[c] && waddr_i[c] == issue_instr_i[i].rd[4:0])) begin - issue_ack_o[i] = 1'b1; - end - end - if (i > 0) begin - if ((issue_instr_i[i].rd[4:0] == issue_instr_i[i-1].rd[4:0]) && (issue_instr_i[i].rd[4:0] != '0)) begin - issue_ack_o[i] = 1'b0; - end - end + if (!stall_raw[i] && !stall_waw[i]) begin + issue_ack[i] = 1'b1; end - // we can also issue the instruction under the following two circumstances: - // we can do this even if we are stalled or no functional unit is ready (as we don't need one) - // the decoder needs to make sure that the instruction is marked as valid when it does not - // need any functional unit or if an exception occurred previous to the execute stage. - // 1. we already got an exception if (issue_instr_i[i].ex.valid) begin - issue_ack_o[i] = 1'b1; - end - // 2. it is an instruction which does not need any functional unit - if (issue_instr_i[i].fu == NONE) begin - issue_ack_o[i] = 1'b1; + issue_ack[i] = 1'b1; end end end if (CVA6Cfg.SuperscalarEn) begin - if (!issue_ack_o[0]) begin - issue_ack_o[1] = 1'b0; + if (!issue_ack[0]) begin + issue_ack[1] = 1'b0; end end + issue_ack_o = issue_ack; + // Do not acknowledge the issued instruction if transaction is not completed. 
+ if (issue_instr_i[0].fu == CVXIF && !(x_transaction_accepted_o || x_transaction_rejected)) begin + issue_ack_o[0] = issue_instr_i[0].ex.valid && issue_instr_valid_i[0]; + end end // ---------------------- @@ -641,8 +955,8 @@ module issue_read_operands logic [CVA6Cfg.NrCommitPorts-1:0] we_pack; for (genvar i = 0; i < CVA6Cfg.NrIssuePorts; i++) begin - assign raddr_pack[i*OPERANDS_PER_INSTR+0] = issue_instr_i[i].rs1[4:0]; - assign raddr_pack[i*OPERANDS_PER_INSTR+1] = issue_instr_i[i].rs2[4:0]; + assign raddr_pack[i*OPERANDS_PER_INSTR+0] = issue_instr_i[i].rs1; + assign raddr_pack[i*OPERANDS_PER_INSTR+1] = issue_instr_i[i].rs2; if (OPERANDS_PER_INSTR == 3) begin assign raddr_pack[i*OPERANDS_PER_INSTR+2] = issue_instr_i[i].result[4:0]; end @@ -660,13 +974,14 @@ module issue_read_operands .NR_READ_PORTS(CVA6Cfg.NrRgprPorts), .ZERO_REG_ZERO(1) ) i_ariane_regfile_fpga ( + .clk_i, + .rst_ni, .test_en_i(1'b0), .raddr_i (raddr_pack), .rdata_o (rdata), .waddr_i (waddr_pack), .wdata_i (wdata_pack), - .we_i (we_pack), - .* + .we_i (we_pack) ); end else begin : gen_asic_regfile ariane_regfile #( @@ -675,13 +990,14 @@ module issue_read_operands .NR_READ_PORTS(CVA6Cfg.NrRgprPorts), .ZERO_REG_ZERO(1) ) i_ariane_regfile ( + .clk_i, + .rst_ni, .test_en_i(1'b0), .raddr_i (raddr_pack), .rdata_o (rdata), .waddr_i (waddr_pack), .wdata_i (wdata_pack), - .we_i (we_pack), - .* + .we_i (we_pack) ); end @@ -720,13 +1036,14 @@ module issue_read_operands .NR_READ_PORTS(3), .ZERO_REG_ZERO(0) ) i_ariane_fp_regfile_fpga ( + .clk_i, + .rst_ni, .test_en_i(1'b0), .raddr_i (fp_raddr_pack), .rdata_o (fprdata), .waddr_i (waddr_pack), .wdata_i (fp_wdata_pack), - .we_i (we_fpr_i), - .* + .we_i (we_fpr_i) ); end else begin : gen_asic_fp_regfile ariane_regfile #( @@ -735,13 +1052,14 @@ module issue_read_operands .NR_READ_PORTS(3), .ZERO_REG_ZERO(0) ) i_ariane_fp_regfile ( + .clk_i, + .rst_ni, .test_en_i(1'b0), .raddr_i (fp_raddr_pack), .rdata_o (fprdata), .waddr_i (waddr_pack), .wdata_i (fp_wdata_pack), - 
.we_i (we_fpr_i), - .* + .we_i (we_fpr_i) ); end end else begin : no_fpr_gen @@ -749,14 +1067,14 @@ module issue_read_operands end endgenerate - if (CVA6Cfg.NrRgprPorts == 3) begin : gen_operand_c + if (OPERANDS_PER_INSTR == 3) begin : gen_operand_c assign operand_c_fpr = {{CVA6Cfg.XLEN - CVA6Cfg.FLen{1'b0}}, fprdata[2]}; end else begin assign operand_c_fpr = fprdata[2]; end for (genvar i = 0; i < CVA6Cfg.NrIssuePorts; i++) begin - if (CVA6Cfg.NrRgprPorts == 3) begin : gen_operand_c + if (OPERANDS_PER_INSTR == 3) begin : gen_operand_c assign operand_c_gpr[i] = rdata[i*OPERANDS_PER_INSTR+2]; end @@ -766,7 +1084,7 @@ module issue_read_operands assign operand_b_regfile[i] = (CVA6Cfg.FpPresent && is_rs2_fpr( issue_instr_i[i].op )) ? {{CVA6Cfg.XLEN - CVA6Cfg.FLen{1'b0}}, fprdata[1]} : rdata[i*OPERANDS_PER_INSTR+1]; - assign operand_c_regfile[i] = (CVA6Cfg.NrRgprPorts == 3) ? ((CVA6Cfg.FpPresent && is_imm_fpr( + assign operand_c_regfile[i] = (OPERANDS_PER_INSTR == 3) ? ((CVA6Cfg.FpPresent && is_imm_fpr( issue_instr_i[i].op )) ? 
operand_c_fpr : operand_c_gpr[i]) : operand_c_fpr; end @@ -780,9 +1098,10 @@ module issue_read_operands if (CVA6Cfg.RVH) begin tinst_q <= '0; end - pc_o <= '0; - is_compressed_instr_o <= 1'b0; - branch_predict_o <= {cf_t'(0), {CVA6Cfg.VLEN{1'b0}}}; + pc_o <= '0; + is_compressed_instr_o <= 1'b0; + branch_predict_o <= {cf_t'(0), {CVA6Cfg.VLEN{1'b0}}}; + x_transaction_rejected_o <= 1'b0; end else begin fu_data_q <= fu_data_n; if (CVA6Cfg.RVH) begin @@ -800,6 +1119,10 @@ module issue_read_operands is_compressed_instr_o <= issue_instr_i[0].is_compressed; branch_predict_o <= issue_instr_i[0].bp; end + x_transaction_rejected_o <= 1'b0; + if (issue_instr_i[0].fu == CVXIF) begin + x_transaction_rejected_o <= x_transaction_rejected; + end end end diff --git a/core/issue_stage.sv b/core/issue_stage.sv index d17f8f3af9..788cbe09b4 100644 --- a/core/issue_stage.sv +++ b/core/issue_stage.sv @@ -22,7 +22,12 @@ module issue_stage parameter type branchpredict_sbe_t = logic, parameter type exception_t = logic, parameter type fu_data_t = logic, - parameter type scoreboard_entry_t = logic + parameter type scoreboard_entry_t = logic, + parameter type writeback_t = logic, + parameter type x_issue_req_t = logic, + parameter type x_issue_resp_t = logic, + parameter type x_register_t = logic, + parameter type x_commit_t = logic ) ( // Subsystem Clock - SUBSYSTEM input logic clk_i, @@ -30,9 +35,9 @@ module issue_stage input logic rst_ni, // Is scoreboard full - PERF_COUNTERS output logic sb_full_o, - // TO_BE_COMPLETED - CONTROLLER + // Prevent from issuing - CONTROLLER input logic flush_unissued_instr_i, - // TO_BE_COMPLETED - CONTROLLER + // Flush whole scoreboard - CONTROLLER input logic flush_i, // Stall inserted by Acc dispatcher - ACC_DISPATCHER input logic stall_i, @@ -60,18 +65,18 @@ module issue_stage output logic [CVA6Cfg.NrIssuePorts-1:0][31:0] tinst_o, // Fixed Latency Unit is ready - EX_STAGE input logic flu_ready_i, - // ALU FU is valid - EX_STAGE + // ALU output is valid - 
EX_STAGE output logic [CVA6Cfg.NrIssuePorts-1:0] alu_valid_o, + // Branch unit is valid - EX_STAGE + output logic [CVA6Cfg.NrIssuePorts-1:0] branch_valid_o, + // Information of branch prediction - EX_STAGE + output branchpredict_sbe_t branch_predict_o, // Signaling that we resolved the branch - EX_STAGE input logic resolve_branch_i, // Load store unit FU is ready - EX_STAGE input logic lsu_ready_i, // Load store unit FU is valid - EX_STAGE output logic [CVA6Cfg.NrIssuePorts-1:0] lsu_valid_o, - // Branch unit is valid - EX_STAGE - output logic [CVA6Cfg.NrIssuePorts-1:0] branch_valid_o, - // Information of branch prediction - EX_STAGE - output branchpredict_sbe_t branch_predict_o, // Mult FU is valid - EX_STAGE output logic [CVA6Cfg.NrIssuePorts-1:0] mult_valid_o, // FPU FU is ready - EX_STAGE @@ -87,34 +92,58 @@ module issue_stage // CSR is valid - EX_STAGE output logic [CVA6Cfg.NrIssuePorts-1:0] csr_valid_o, // CVXIF FU is valid - EX_STAGE - output logic [CVA6Cfg.NrIssuePorts-1:0] x_issue_valid_o, + output logic [CVA6Cfg.NrIssuePorts-1:0] xfu_valid_o, // CVXIF is FU ready - EX_STAGE - input logic x_issue_ready_i, + input logic xfu_ready_i, // CVXIF offloader instruction value - EX_STAGE output logic [31:0] x_off_instr_o, + // CVA6 Hart ID - SUBSYSTEM + input logic [CVA6Cfg.XLEN-1:0] hart_id_i, + // CVXIF Issue interface - EX_STAGE + input logic x_issue_ready_i, + // TO_BE_COMPLETED - EX_STAGE + input x_issue_resp_t x_issue_resp_i, + // TO_BE_COMPLETED - EX_STAGE + output logic x_issue_valid_o, + // TO_BE_COMPLETED - EX_STAGE + output x_issue_req_t x_issue_req_o, + // CVXIF Register interface - EX_STAGE + input logic x_register_ready_i, + // TO_BE_COMPLETED - EX_STAGE + output logic x_register_valid_o, + // TO_BE_COMPLETED - EX_STAGE + output x_register_t x_register_o, + // CVXIF Commit interface - EX_STAGE + output logic x_commit_valid_o, + // TO_BE_COMPLETED - EX_STAGE + output x_commit_t x_commit_o, + // CVXIF Transaction rejected -> instruction is illegal - 
EX_STAGE + output logic x_transaction_rejected_o, // Issue scoreboard entry - ACC_DISPATCHER output scoreboard_entry_t issue_instr_o, // TO_BE_COMPLETED - ACC_DISPATCHER output logic issue_instr_hs_o, // Transaction ID - EX_STAGE input logic [CVA6Cfg.NrWbPorts-1:0][CVA6Cfg.TRANS_ID_BITS-1:0] trans_id_i, - // The branch engine uses the write back from the ALU - EX_STAGE + // Result from branch unit - EX_STAGE input bp_resolve_t resolved_branch_i, - // TO_BE_COMPLETED - EX_STAGE + // Results to write back - EX_STAGE input logic [CVA6Cfg.NrWbPorts-1:0][CVA6Cfg.XLEN-1:0] wbdata_i, // exception from execute stage or CVXIF - EX_STAGE input exception_t [CVA6Cfg.NrWbPorts-1:0] ex_ex_i, - // TO_BE_COMPLETED - EX_STAGE + // Indicates valid results - EX_STAGE input logic [CVA6Cfg.NrWbPorts-1:0] wt_valid_i, // CVXIF write enable - EX_STAGE input logic x_we_i, - // TO_BE_COMPLETED - EX_STAGE + // CVXIF destination register - EX_STAGE + input logic [4:0] x_rd_i, + // Destination register in register file - COMMIT_STAGE input logic [CVA6Cfg.NrCommitPorts-1:0][4:0] waddr_i, - // TO_BE_COMPLETED - EX_STAGE + // Value to write to register file - COMMIT_STAGE input logic [CVA6Cfg.NrCommitPorts-1:0][CVA6Cfg.XLEN-1:0] wdata_i, - // GPR write enable - EX_STAGE + // GPR write enable - COMMIT_STAGE input logic [CVA6Cfg.NrCommitPorts-1:0] we_gpr_i, - // FPR write enable - EX_STAGE + // FPR write enable - COMMIT_STAGE input logic [CVA6Cfg.NrCommitPorts-1:0] we_fpr_i, // Instructions to commit - COMMIT_STAGE output scoreboard_entry_t [CVA6Cfg.NrCommitPorts-1:0] commit_instr_o, @@ -133,29 +162,21 @@ module issue_stage // Scoreboard (SB) <-> Issue and Read Operands (IRO) // --------------------------------------------------- typedef logic [(CVA6Cfg.NrRgprPorts == 3 ? 
CVA6Cfg.XLEN : CVA6Cfg.FLen)-1:0] rs3_len_t; + typedef struct packed { + logic [CVA6Cfg.NR_SB_ENTRIES-1:0] still_issued; + logic [CVA6Cfg.TRANS_ID_BITS-1:0] issue_pointer; + writeback_t [CVA6Cfg.NrWbPorts-1:0] wb; + scoreboard_entry_t [CVA6Cfg.NR_SB_ENTRIES-1:0] sbe; + } forwarding_t; - fu_t [ 2**REG_ADDR_SIZE-1:0] rd_clobber_gpr_sb_iro; - fu_t [ 2**REG_ADDR_SIZE-1:0] rd_clobber_fpr_sb_iro; + forwarding_t fwd; + scoreboard_entry_t [CVA6Cfg.NrIssuePorts-1:0] issue_instr_sb_iro; + logic [CVA6Cfg.NrIssuePorts-1:0][ 31:0] orig_instr_sb_iro; + logic [CVA6Cfg.NrIssuePorts-1:0] issue_instr_valid_sb_iro; + logic [CVA6Cfg.NrIssuePorts-1:0] issue_ack_iro_sb; - logic [CVA6Cfg.NrIssuePorts-1:0][REG_ADDR_SIZE-1:0] rs1_iro_sb; - logic [CVA6Cfg.NrIssuePorts-1:0][ CVA6Cfg.XLEN-1:0] rs1_sb_iro; - logic [CVA6Cfg.NrIssuePorts-1:0] rs1_valid_sb_iro; - - logic [CVA6Cfg.NrIssuePorts-1:0][REG_ADDR_SIZE-1:0] rs2_iro_sb; - logic [CVA6Cfg.NrIssuePorts-1:0][ CVA6Cfg.XLEN-1:0] rs2_sb_iro; - logic [CVA6Cfg.NrIssuePorts-1:0] rs2_valid_iro_sb; - - logic [CVA6Cfg.NrIssuePorts-1:0][REG_ADDR_SIZE-1:0] rs3_iro_sb; - rs3_len_t [CVA6Cfg.NrIssuePorts-1:0] rs3_sb_iro; - logic [CVA6Cfg.NrIssuePorts-1:0] rs3_valid_iro_sb; - - scoreboard_entry_t [CVA6Cfg.NrIssuePorts-1:0] issue_instr_sb_iro; - logic [CVA6Cfg.NrIssuePorts-1:0][ 31:0] orig_instr_sb_iro; - logic [CVA6Cfg.NrIssuePorts-1:0] issue_instr_valid_sb_iro; - logic [CVA6Cfg.NrIssuePorts-1:0] issue_ack_iro_sb; - - logic [CVA6Cfg.NrIssuePorts-1:0][ CVA6Cfg.XLEN-1:0] rs1_forwarding_xlen; - logic [CVA6Cfg.NrIssuePorts-1:0][ CVA6Cfg.XLEN-1:0] rs2_forwarding_xlen; + logic [CVA6Cfg.NrIssuePorts-1:0][CVA6Cfg.XLEN-1:0] rs1_forwarding_xlen; + logic [CVA6Cfg.NrIssuePorts-1:0][CVA6Cfg.XLEN-1:0] rs2_forwarding_xlen; for (genvar i = 0; i < CVA6Cfg.NrIssuePorts; i++) begin assign rs1_forwarding_o[i] = rs1_forwarding_xlen[i][CVA6Cfg.VLEN-1:0]; @@ -165,6 +186,8 @@ module issue_stage assign issue_instr_o = issue_instr_sb_iro[0]; assign issue_instr_hs_o = 
issue_instr_valid_sb_iro[0] & issue_ack_iro_sb[0]; + logic x_transaction_accepted_iro_sb, x_issue_writeback_iro_sb; + logic [CVA6Cfg.TRANS_ID_BITS-1:0] x_id_iro_sb; // --------------------------------------------------------- // 2. Manage instructions in a scoreboard @@ -173,35 +196,40 @@ module issue_stage .CVA6Cfg (CVA6Cfg), .rs3_len_t (rs3_len_t), .bp_resolve_t(bp_resolve_t), + .writeback_t(writeback_t), + .forwarding_t(forwarding_t), .exception_t(exception_t), .scoreboard_entry_t(scoreboard_entry_t) ) i_scoreboard ( - .sb_full_o (sb_full_o), - .rd_clobber_gpr_o(rd_clobber_gpr_sb_iro), - .rd_clobber_fpr_o(rd_clobber_fpr_sb_iro), - .rs1_i (rs1_iro_sb), - .rs1_o (rs1_sb_iro), - .rs1_valid_o (rs1_valid_sb_iro), - .rs2_i (rs2_iro_sb), - .rs2_o (rs2_sb_iro), - .rs2_valid_o (rs2_valid_iro_sb), - .rs3_i (rs3_iro_sb), - .rs3_o (rs3_sb_iro), - .rs3_valid_o (rs3_valid_iro_sb), - - .decoded_instr_i (decoded_instr_i), - .decoded_instr_valid_i(decoded_instr_valid_i), - .decoded_instr_ack_o (decoded_instr_ack_o), - .issue_instr_o (issue_instr_sb_iro), - .orig_instr_o (orig_instr_sb_iro), - .issue_instr_valid_o (issue_instr_valid_sb_iro), - .issue_ack_i (issue_ack_iro_sb), - - .resolved_branch_i(resolved_branch_i), - .trans_id_i (trans_id_i), - .wbdata_i (wbdata_i), - .ex_i (ex_ex_i), - .* + .clk_i, + .rst_ni, + .sb_full_o (sb_full_o), + .flush_unissued_instr_i, + .flush_i, + .x_transaction_accepted_i(x_transaction_accepted_iro_sb), + .x_issue_writeback_i (x_issue_writeback_iro_sb), + .x_id_i (x_id_iro_sb), + .commit_instr_o, + .commit_drop_o, + .commit_ack_i, + .decoded_instr_i (decoded_instr_i), + .orig_instr_i, + .decoded_instr_valid_i (decoded_instr_valid_i), + .decoded_instr_ack_o (decoded_instr_ack_o), + .issue_instr_o (issue_instr_sb_iro), + .orig_instr_o (orig_instr_sb_iro), + .issue_instr_valid_o (issue_instr_valid_sb_iro), + .issue_ack_i (issue_ack_iro_sb), + .fwd_o (fwd), + .resolved_branch_i (resolved_branch_i), + .trans_id_i (trans_id_i), + .wbdata_i (wbdata_i), + 
.ex_i (ex_ex_i), + .wt_valid_i, + .x_we_i, + .x_rd_i, + .rvfi_issue_pointer_o, + .rvfi_commit_pointer_o ); // --------------------------------------------------------- @@ -212,39 +240,64 @@ module issue_stage .branchpredict_sbe_t(branchpredict_sbe_t), .fu_data_t(fu_data_t), .scoreboard_entry_t(scoreboard_entry_t), - .rs3_len_t(rs3_len_t) + .rs3_len_t(rs3_len_t), + .writeback_t(writeback_t), + .forwarding_t(forwarding_t), + .x_issue_req_t(x_issue_req_t), + .x_issue_resp_t(x_issue_resp_t), + .x_register_t(x_register_t), + .x_commit_t(x_commit_t) ) i_issue_read_operands ( - .flush_i (flush_unissued_instr_i), - .issue_instr_i (issue_instr_sb_iro), - .orig_instr_i (orig_instr_sb_iro), - .issue_instr_valid_i(issue_instr_valid_sb_iro), - .issue_ack_o (issue_ack_iro_sb), - .fu_data_o (fu_data_o), - .flu_ready_i (flu_ready_i), - .rs1_o (rs1_iro_sb), - .rs1_i (rs1_sb_iro), - .rs1_valid_i (rs1_valid_sb_iro), - .rs2_o (rs2_iro_sb), - .rs2_i (rs2_sb_iro), - .rs2_valid_i (rs2_valid_iro_sb), - .rs3_o (rs3_iro_sb), - .rs3_i (rs3_sb_iro), - .rs3_valid_i (rs3_valid_iro_sb), - .rd_clobber_gpr_i (rd_clobber_gpr_sb_iro), - .rd_clobber_fpr_i (rd_clobber_fpr_sb_iro), - .alu_valid_o (alu_valid_o), - .alu2_valid_o (alu2_valid_o), - .branch_valid_o (branch_valid_o), - .csr_valid_o (csr_valid_o), - .cvxif_valid_o (x_issue_valid_o), - .cvxif_ready_i (x_issue_ready_i), - .cvxif_off_instr_o (x_off_instr_o), - .mult_valid_o (mult_valid_o), - .rs1_forwarding_o (rs1_forwarding_xlen), - .rs2_forwarding_o (rs2_forwarding_xlen), - .stall_issue_o (stall_issue_o), - .tinst_o (tinst_o), - .* + .clk_i, + .rst_ni, + .flush_i (flush_unissued_instr_i), + .stall_i, + .issue_instr_i (issue_instr_sb_iro), + .orig_instr_i (orig_instr_sb_iro), + .issue_instr_valid_i (issue_instr_valid_sb_iro), + .issue_ack_o (issue_ack_iro_sb), + .fwd_i (fwd), + .fu_data_o (fu_data_o), + .rs1_forwarding_o (rs1_forwarding_xlen), + .rs2_forwarding_o (rs2_forwarding_xlen), + .pc_o, + .is_compressed_instr_o, + .flu_ready_i 
(flu_ready_i), + .alu_valid_o (alu_valid_o), + .branch_valid_o (branch_valid_o), + .tinst_o (tinst_o), + .branch_predict_o, + .lsu_ready_i, + .lsu_valid_o, + .mult_valid_o, + .fpu_ready_i, + .fpu_valid_o, + .fpu_fmt_o, + .fpu_rm_o, + .alu2_valid_o, + .csr_valid_o, + .cvxif_valid_o (xfu_valid_o), + .cvxif_ready_i (xfu_ready_i), + .cvxif_off_instr_o (x_off_instr_o), + .hart_id_i (hart_id_i), + .x_issue_ready_i (x_issue_ready_i), + .x_issue_resp_i (x_issue_resp_i), + .x_issue_valid_o (x_issue_valid_o), + .x_issue_req_o (x_issue_req_o), + .x_register_ready_i (x_register_ready_i), + .x_register_valid_o (x_register_valid_o), + .x_register_o (x_register_o), + .x_commit_valid_o (x_commit_valid_o), + .x_commit_o (x_commit_o), + .x_transaction_accepted_o(x_transaction_accepted_iro_sb), + .x_transaction_rejected_o(x_transaction_rejected_o), + .x_issue_writeback_o (x_issue_writeback_iro_sb), + .x_id_o (x_id_iro_sb), + .waddr_i, + .wdata_i, + .we_gpr_i, + .we_fpr_i, + .stall_issue_o ); endmodule diff --git a/core/load_store_unit.sv b/core/load_store_unit.sv index 119bf1c020..04c4e00e07 100644 --- a/core/load_store_unit.sv +++ b/core/load_store_unit.sv @@ -148,9 +148,9 @@ module load_store_unit input amo_resp_t amo_resp_i, // PMP configuration - CSR_REGFILE - input riscv::pmpcfg_t [CVA6Cfg.NrPMPEntries:0] pmpcfg_i, + input riscv::pmpcfg_t [CVA6Cfg.NrPMPEntries-1:0] pmpcfg_i, // PMP address - CSR_REGFILE - input logic [CVA6Cfg.NrPMPEntries:0][CVA6Cfg.PLEN-3:0] pmpaddr_i, + input logic [CVA6Cfg.NrPMPEntries-1:0][CVA6Cfg.PLEN-3:0] pmpaddr_i, // RVFI inforamtion - RVFI output lsu_ctrl_t rvfi_lsu_ctrl_o, @@ -256,6 +256,10 @@ module load_store_unit .clk_i(clk_i), .rst_ni(rst_ni), .flush_i(flush_i), + .enable_translation_i, + .enable_g_translation_i, + .en_ld_st_translation_i, + .en_ld_st_g_translation_i, .icache_areq_i(icache_areq_i), .icache_areq_o(icache_areq_o), // misaligned bypass @@ -273,10 +277,29 @@ module load_store_unit .lsu_exception_o(mmu_exception), .priv_lvl_i 
(priv_lvl_i), + .v_i, .ld_st_priv_lvl_i(ld_st_priv_lvl_i), + .ld_st_v_i, + .sum_i, + .vs_sum_i, + .mxr_i, + .vmxr_i, .hlvx_inst_i (mmu_hlvx_inst), .hs_ld_st_inst_i(mmu_hs_ld_st_inst), + .satp_ppn_i, + .vsatp_ppn_i, + .hgatp_ppn_i, + .asid_i, + .vs_asid_i, + .asid_to_be_flushed_i, + .vmid_i, + .vmid_to_be_flushed_i, + .vaddr_to_be_flushed_i, + .gpaddr_to_be_flushed_i, + .flush_tlb_i, + .flush_tlb_vvma_i, + .flush_tlb_gvma_i, .itlb_miss_o(itlb_miss_o), .dtlb_miss_o(dtlb_miss_o), @@ -284,8 +307,7 @@ module load_store_unit .req_port_i(dcache_req_ports_i[0]), .req_port_o(dcache_req_ports_o[0]), .pmpcfg_i, - .pmpaddr_i, - .* + .pmpaddr_i ); end else begin : gen_no_mmu @@ -391,6 +413,9 @@ module load_store_unit .exception_t(exception_t), .lsu_ctrl_t(lsu_ctrl_t) ) i_load_unit ( + .clk_i, + .rst_ni, + .flush_i, .valid_i (ld_valid_i), .lsu_ctrl_i(lsu_ctrl), .pop_ld_o (pop_ld), @@ -413,12 +438,11 @@ module load_store_unit .page_offset_o (page_offset), .page_offset_matches_i(page_offset_matches), .store_buffer_empty_i (store_buffer_empty), + .commit_tran_id_i, // to memory arbiter .req_port_i (dcache_req_ports_i[1]), .req_port_o (dcache_req_ports_o[1]), - .dcache_wbuffer_not_ni_i, - .commit_tran_id_i, - .* + .dcache_wbuffer_not_ni_i ); // ---------------------------- @@ -541,17 +565,22 @@ module load_store_unit data_misaligned = 1'b0; if (lsu_ctrl.valid) begin - case (lsu_ctrl.operation) - // double word - LD, SD, FLD, FSD, - AMO_LRD, AMO_SCD, - AMO_SWAPD, AMO_ADDD, AMO_ANDD, AMO_ORD, - AMO_XORD, AMO_MAXD, AMO_MAXDU, AMO_MIND, - AMO_MINDU, HLV_D, HSV_D: begin - if (CVA6Cfg.IS_XLEN64 && lsu_ctrl.vaddr[2:0] != 3'b000) begin - data_misaligned = 1'b1; + if (CVA6Cfg.IS_XLEN64) begin + case (lsu_ctrl.operation) + // double word + LD, SD, FLD, FSD, + AMO_LRD, AMO_SCD, + AMO_SWAPD, AMO_ADDD, AMO_ANDD, AMO_ORD, + AMO_XORD, AMO_MAXD, AMO_MAXDU, AMO_MIND, + AMO_MINDU, HLV_D, HSV_D: begin + if (lsu_ctrl.vaddr[2:0] != 3'b000) begin + data_misaligned = 1'b1; + end end - end + default: ; + 
endcase + end + case (lsu_ctrl.operation) // word LW, LWU, SW, FLW, FSW, AMO_LRW, AMO_SCW, @@ -574,83 +603,94 @@ module load_store_unit end if (data_misaligned) begin - - if (lsu_ctrl.fu == LOAD) begin - misaligned_exception.cause = riscv::LD_ADDR_MISALIGNED; - misaligned_exception.valid = 1'b1; - if (CVA6Cfg.TvalEn) - misaligned_exception.tval = {{CVA6Cfg.XLEN - CVA6Cfg.VLEN{1'b0}}, lsu_ctrl.vaddr}; - if (CVA6Cfg.RVH) begin - misaligned_exception.tval2 = '0; - misaligned_exception.tinst = lsu_ctrl.tinst; - misaligned_exception.gva = ld_st_v_i; + case (lsu_ctrl.fu) + LOAD: begin + misaligned_exception.cause = riscv::LD_ADDR_MISALIGNED; + misaligned_exception.valid = 1'b1; + if (CVA6Cfg.TvalEn) + misaligned_exception.tval = {{CVA6Cfg.XLEN - CVA6Cfg.VLEN{1'b0}}, lsu_ctrl.vaddr}; + if (CVA6Cfg.RVH) begin + misaligned_exception.tval2 = '0; + misaligned_exception.tinst = lsu_ctrl.tinst; + misaligned_exception.gva = ld_st_v_i; + end end - - end else if (lsu_ctrl.fu == STORE) begin - misaligned_exception.cause = riscv::ST_ADDR_MISALIGNED; - misaligned_exception.valid = 1'b1; - if (CVA6Cfg.TvalEn) - misaligned_exception.tval = {{CVA6Cfg.XLEN - CVA6Cfg.VLEN{1'b0}}, lsu_ctrl.vaddr}; - if (CVA6Cfg.RVH) begin - misaligned_exception.tval2 = '0; - misaligned_exception.tinst = lsu_ctrl.tinst; - misaligned_exception.gva = ld_st_v_i; + STORE: begin + + misaligned_exception.cause = riscv::ST_ADDR_MISALIGNED; + misaligned_exception.valid = 1'b1; + if (CVA6Cfg.TvalEn) + misaligned_exception.tval = {{CVA6Cfg.XLEN - CVA6Cfg.VLEN{1'b0}}, lsu_ctrl.vaddr}; + if (CVA6Cfg.RVH) begin + misaligned_exception.tval2 = '0; + misaligned_exception.tinst = lsu_ctrl.tinst; + misaligned_exception.gva = ld_st_v_i; + end end - end + default: ; + endcase end if (CVA6Cfg.MmuPresent && en_ld_st_translation_i && lsu_ctrl.overflow) begin - if (lsu_ctrl.fu == LOAD) begin - misaligned_exception.cause = riscv::LOAD_PAGE_FAULT; - misaligned_exception.valid = 1'b1; - if (CVA6Cfg.TvalEn) - misaligned_exception.tval 
= {{CVA6Cfg.XLEN - CVA6Cfg.VLEN{1'b0}}, lsu_ctrl.vaddr}; - if (CVA6Cfg.RVH) begin - misaligned_exception.tval2 = '0; - misaligned_exception.tinst = lsu_ctrl.tinst; - misaligned_exception.gva = ld_st_v_i; + case (lsu_ctrl.fu) + LOAD: begin + misaligned_exception.cause = riscv::LOAD_PAGE_FAULT; + misaligned_exception.valid = 1'b1; + if (CVA6Cfg.TvalEn) + misaligned_exception.tval = {{CVA6Cfg.XLEN - CVA6Cfg.VLEN{1'b0}}, lsu_ctrl.vaddr}; + if (CVA6Cfg.RVH) begin + misaligned_exception.tval2 = '0; + misaligned_exception.tinst = lsu_ctrl.tinst; + misaligned_exception.gva = ld_st_v_i; + end end - - end else if (lsu_ctrl.fu == STORE) begin - misaligned_exception.cause = riscv::STORE_PAGE_FAULT; - misaligned_exception.valid = 1'b1; - if (CVA6Cfg.TvalEn) - misaligned_exception.tval = {{CVA6Cfg.XLEN - CVA6Cfg.VLEN{1'b0}}, lsu_ctrl.vaddr}; - if (CVA6Cfg.RVH) begin - misaligned_exception.tval2 = '0; - misaligned_exception.tinst = lsu_ctrl.tinst; - misaligned_exception.gva = ld_st_v_i; + STORE: begin + misaligned_exception.cause = riscv::STORE_PAGE_FAULT; + misaligned_exception.valid = 1'b1; + if (CVA6Cfg.TvalEn) + misaligned_exception.tval = {{CVA6Cfg.XLEN - CVA6Cfg.VLEN{1'b0}}, lsu_ctrl.vaddr}; + if (CVA6Cfg.RVH) begin + misaligned_exception.tval2 = '0; + misaligned_exception.tinst = lsu_ctrl.tinst; + misaligned_exception.gva = ld_st_v_i; + end end - end + default: ; + endcase end if (CVA6Cfg.MmuPresent && CVA6Cfg.RVH && en_ld_st_g_translation_i && !en_ld_st_translation_i && lsu_ctrl.g_overflow) begin - if (lsu_ctrl.fu == LOAD) begin - misaligned_exception.cause = riscv::LOAD_GUEST_PAGE_FAULT; - misaligned_exception.valid = 1'b1; - if (CVA6Cfg.TvalEn) - misaligned_exception.tval = {{CVA6Cfg.XLEN - CVA6Cfg.VLEN{1'b0}}, lsu_ctrl.vaddr}; - if (CVA6Cfg.RVH) begin - misaligned_exception.tval2 = '0; - misaligned_exception.tinst = lsu_ctrl.tinst; - misaligned_exception.gva = ld_st_v_i; + case (lsu_ctrl.fu) + LOAD: begin + misaligned_exception.cause = riscv::LOAD_GUEST_PAGE_FAULT; + 
misaligned_exception.valid = 1'b1; + if (CVA6Cfg.TvalEn) + misaligned_exception.tval = {{CVA6Cfg.XLEN - CVA6Cfg.VLEN{1'b0}}, lsu_ctrl.vaddr}; + if (CVA6Cfg.RVH) begin + misaligned_exception.tval2 = '0; + misaligned_exception.tinst = lsu_ctrl.tinst; + misaligned_exception.gva = ld_st_v_i; + end end - end else if (lsu_ctrl.fu == STORE) begin - misaligned_exception.cause = riscv::STORE_GUEST_PAGE_FAULT; - misaligned_exception.valid = 1'b1; - if (CVA6Cfg.TvalEn) - misaligned_exception.tval = {{CVA6Cfg.XLEN - CVA6Cfg.VLEN{1'b0}}, lsu_ctrl.vaddr}; - if (CVA6Cfg.RVH) begin - misaligned_exception.tval2 = '0; - misaligned_exception.tinst = lsu_ctrl.tinst; - misaligned_exception.gva = ld_st_v_i; + STORE: begin + misaligned_exception.cause = riscv::STORE_GUEST_PAGE_FAULT; + misaligned_exception.valid = 1'b1; + if (CVA6Cfg.TvalEn) + misaligned_exception.tval = {{CVA6Cfg.XLEN - CVA6Cfg.VLEN{1'b0}}, lsu_ctrl.vaddr}; + if (CVA6Cfg.RVH) begin + misaligned_exception.tval2 = '0; + misaligned_exception.tinst = lsu_ctrl.tinst; + misaligned_exception.gva = ld_st_v_i; + end end - end + default: ; + endcase end end + // ------------------ // LSU Control // ------------------ @@ -676,17 +716,20 @@ module load_store_unit .CVA6Cfg(CVA6Cfg), .lsu_ctrl_t(lsu_ctrl_t) ) lsu_bypass_i ( + .clk_i, + .rst_ni, + .flush_i, .lsu_req_i (lsu_req_i), .lsu_req_valid_i(lsu_valid_i), .pop_ld_i (pop_ld), .pop_st_i (pop_st), .lsu_ctrl_o(lsu_ctrl), - .ready_o (lsu_ready_o), - .* + .ready_o (lsu_ready_o) ); assign rvfi_lsu_ctrl_o = lsu_ctrl; endmodule + diff --git a/core/load_unit.sv b/core/load_unit.sv index beb953cbd5..2109b08ded 100644 --- a/core/load_unit.sv +++ b/core/load_unit.sv @@ -31,53 +31,53 @@ module load_unit input logic clk_i, // Asynchronous reset active low - SUBSYSTEM input logic rst_ni, - // TO_BE_COMPLETED - TO_BE_COMPLETED + // Flush signal - CONTROLLER input logic flush_i, - // Load unit input port - TO_BE_COMPLETED + // Load request is valid - LSU_BYPASS input logic valid_i, - // 
TO_BE_COMPLETED - TO_BE_COMPLETED + // Load request input - LSU_BYPASS input lsu_ctrl_t lsu_ctrl_i, - // TO_BE_COMPLETED - TO_BE_COMPLETED + // Pop the load request from the LSU bypass FIFO - LSU_BYPASS output logic pop_ld_o, - // Load unit result is valid - TO_BE_COMPLETED + // Load unit result is valid - ISSUE_STAGE output logic valid_o, - // Load transaction ID - TO_BE_COMPLETED + // Load transaction ID - ISSUE_STAGE output logic [CVA6Cfg.TRANS_ID_BITS-1:0] trans_id_o, - // Load result - TO_BE_COMPLETED + // Load result - ISSUE_STAGE output logic [CVA6Cfg.XLEN-1:0] result_o, - // Load exception - TO_BE_COMPLETED + // Load exception - ISSUE_STAGE output exception_t ex_o, - // Request address translation - TO_BE_COMPLETED + // Request address translation - MMU output logic translation_req_o, - // Virtual address - TO_BE_COMPLETED + // Virtual address - MMU output logic [CVA6Cfg.VLEN-1:0] vaddr_o, - // Transformed trap instruction out - TO_BE_COMPLETED + // Transformed trap instruction out - MMU output logic [31:0] tinst_o, - // Instruction is a hyp load store instruction - TO_BE_COMPLETED + // Instruction is a hyp load store instruction - MMU output logic hs_ld_st_inst_o, - // Hyp load store with execute permissions - TO_BE_COMPLETED + // Hyp load store with execute permissions - MMU output logic hlvx_inst_o, - // Physical address - TO_BE_COMPLETED + // Physical address - MMU input logic [CVA6Cfg.PLEN-1:0] paddr_i, - // Excepted which appears before load - TO_BE_COMPLETED + // Excepted which appears before load - MMU input exception_t ex_i, - // Data TLB hit - lsu + // Data TLB hit - MMU input logic dtlb_hit_i, - // TO_BE_COMPLETED - TO_BE_COMPLETED + // Physical page number from the DTLB - MMU input logic [CVA6Cfg.PPNW-1:0] dtlb_ppn_i, - // TO_BE_COMPLETED - TO_BE_COMPLETED + // Page offset for address checking - STORE_UNIT output logic [11:0] page_offset_o, - // TO_BE_COMPLETED - TO_BE_COMPLETED + // Indicates if the page offset matches a store unit entry - 
STORE_UNIT input logic page_offset_matches_i, - // Store buffer is empty - TO_BE_COMPLETED + // Store buffer is empty - STORE_UNIT input logic store_buffer_empty_i, - // TO_BE_COMPLETED - TO_BE_COMPLETED + // Transaction ID of the committing instruction - COMMIT_STAGE input logic [CVA6Cfg.TRANS_ID_BITS-1:0] commit_tran_id_i, // Data cache request out - CACHES input dcache_req_o_t req_port_i, // Data cache request in - CACHES output dcache_req_i_t req_port_o, - // TO_BE_COMPLETED - TO_BE_COMPLETED + // Presence of non-idempotent operations in the D$ write buffer - CACHES input logic dcache_wbuffer_not_ni_i ); enum logic [3:0] { @@ -505,7 +505,7 @@ module load_unit // select correct sign bit in parallel to result shifter above // pull to 0 if unsigned - assign rdata_sign_bit = rdata_is_signed & rdata_sign_bits[rdata_offset] | rdata_is_fp_signed; + assign rdata_sign_bit = rdata_is_signed & rdata_sign_bits[rdata_offset] | (CVA6Cfg.FpPresent && rdata_is_fp_signed); // result mux always_comb begin diff --git a/core/round_interval.sv b/core/round_interval.sv deleted file mode 100644 index 9026347bcd..0000000000 --- a/core/round_interval.sv +++ /dev/null @@ -1,53 +0,0 @@ -// Copyright 2024 Thales Silicon Security -// -// Licensed under the Solderpad Hardware Licence, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. 
-// SPDX-License-Identifier: Apache-2.0 WITH SHL-2.0 -// You may obtain a copy of the License at https://solderpad.org/licenses/ -// -// Original Author: Côme ALLART - Thales - -module round_interval #( - parameter int unsigned S = 1, - parameter int unsigned L = 1 << S -) ( - // Start index - // Included in the interval - input logic [S-1:0] start_i, - // Stop index - // Not included in the interval - input logic [S-1:0] stop_i, - - // The interval from start to stop, rounding - // Considered full when start_i == stop_i - output logic [L-1:0] active_o -); - - // Bit high at index start/stop - logic [L-1:0] a; - logic [L-1:0] b; - - for (genvar i = 0; i < L; i++) begin - assign a[i] = start_i == i; - assign b[i] = stop_i == i; - end - - // Propagation to the higher indexes: >= - logic [L-1:0] ge_a; - logic [L-1:0] ge_b; - - assign ge_b[0] = b[0]; - assign ge_a[0] = a[0]; - for (genvar i = 1; i < L; i++) begin - assign ge_b[i] = ge_b[i-1] || b[i]; - assign ge_a[i] = ge_a[i-1] || a[i]; - end - - // < is the negation of >= - logic [L-1:0] lt_b; - assign lt_b = ~ge_b; - - // Build the interval - assign active_o = (start_i <= stop_i) ? 
lt_b & ge_a : lt_b | ge_a; - -endmodule diff --git a/core/scoreboard.sv b/core/scoreboard.sv index 5e74b1511f..6a1b3d094f 100644 --- a/core/scoreboard.sv +++ b/core/scoreboard.sv @@ -17,89 +17,76 @@ module scoreboard #( parameter type bp_resolve_t = logic, parameter type exception_t = logic, parameter type scoreboard_entry_t = logic, + parameter type forwarding_t = logic, + parameter type writeback_t = logic, parameter type rs3_len_t = logic ) ( // Subsystem Clock - SUBSYSTEM - input logic clk_i, + input logic clk_i, // Asynchronous reset active low - SUBSYSTEM - input logic rst_ni, - // TO_BE_COMPLETED - TO_BE_COMPLETED - output logic sb_full_o, - // Flush only un-issued instructions - TO_BE_COMPLETED - input logic flush_unissued_instr_i, - // Flush whole scoreboard - TO_BE_COMPLETED - input logic flush_i, - // TO_BE_COMPLETED - TO_BE_COMPLETED - output ariane_pkg::fu_t [2**ariane_pkg::REG_ADDR_SIZE-1:0] rd_clobber_gpr_o, - // TO_BE_COMPLETED - TO_BE_COMPLETED - output ariane_pkg::fu_t [2**ariane_pkg::REG_ADDR_SIZE-1:0] rd_clobber_fpr_o, - - // rs1 operand address - issue_read_operands - input logic [CVA6Cfg.NrIssuePorts-1:0][ariane_pkg::REG_ADDR_SIZE-1:0] rs1_i, - // rs1 operand - issue_read_operands - output logic [CVA6Cfg.NrIssuePorts-1:0][ CVA6Cfg.XLEN-1:0] rs1_o, - // rs1 operand is valid - issue_read_operands - output logic [CVA6Cfg.NrIssuePorts-1:0] rs1_valid_o, - - // rs2 operand address - issue_read_operands - input logic [CVA6Cfg.NrIssuePorts-1:0][ariane_pkg::REG_ADDR_SIZE-1:0] rs2_i, - // rs2 operand - issue_read_operands - output logic [CVA6Cfg.NrIssuePorts-1:0][ CVA6Cfg.XLEN-1:0] rs2_o, - // rs2 operand is valid - issue_read_operands - output logic [CVA6Cfg.NrIssuePorts-1:0] rs2_valid_o, - - // rs3 operand address - issue_read_operands - input logic [CVA6Cfg.NrIssuePorts-1:0][ariane_pkg::REG_ADDR_SIZE-1:0] rs3_i, - // rs3 operand - issue_read_operands - output rs3_len_t [CVA6Cfg.NrIssuePorts-1:0] rs3_o, - // rs3 operand is valid - issue_read_operands 
- output logic [CVA6Cfg.NrIssuePorts-1:0] rs3_valid_o, - + input logic rst_ni, + // Is scoreboard full - PERF_COUNTERS + output logic sb_full_o, + // Prevent from issuing - CONTROLLER + input logic flush_unissued_instr_i, + // Flush whole scoreboard - CONTROLLER + input logic flush_i, + // Writeback Handling of CVXIF + // TO_BE_COMPLETED - ISSUE_READ_OPERANDS + input logic x_transaction_accepted_i, + // TO_BE_COMPLETED - ISSUE_READ_OPERANDS + input logic x_issue_writeback_i, + // TO_BE_COMPLETED - ISSUE_READ_OPERANDS + input logic [CVA6Cfg.TRANS_ID_BITS-1:0] x_id_i, // advertise instruction to commit stage, if commit_ack_i is asserted advance the commit pointer - // TO_BE_COMPLETED - TO_BE_COMPLETED + // Instructions to commit - COMMIT_STAGE output scoreboard_entry_t [CVA6Cfg.NrCommitPorts-1:0] commit_instr_o, - // TO_BE_COMPLETED - TO_BE_COMPLETED + // Instruction is cancelled - COMMIT_STAGE output logic [CVA6Cfg.NrCommitPorts-1:0] commit_drop_o, - // TO_BE_COMPLETED - TO_BE_COMPLETED + // Commit acknowledge - COMMIT_STAGE input logic [CVA6Cfg.NrCommitPorts-1:0] commit_ack_i, // instruction to put on top of scoreboard e.g.: top pointer // we can always put this instruction to the top unless we signal with asserted full_o - // TO_BE_COMPLETED - TO_BE_COMPLETED + // Handshake's data with decode stage - ID_STAGE input scoreboard_entry_t [CVA6Cfg.NrIssuePorts-1:0] decoded_instr_i, - // TO_BE_COMPLETED - TO_BE_COMPLETED + // instruction value - ID_STAGE input logic [CVA6Cfg.NrIssuePorts-1:0][31:0] orig_instr_i, - // TO_BE_COMPLETED - TO_BE_COMPLETED + // Handshake's valid with decode stage - ID_STAGE input logic [CVA6Cfg.NrIssuePorts-1:0] decoded_instr_valid_i, - // TO_BE_COMPLETED - TO_BE_COMPLETED + // Handshake's acknowlege with decode stage - ID_STAGE output logic [CVA6Cfg.NrIssuePorts-1:0] decoded_instr_ack_o, // instruction to issue logic, if issue_instr_valid and issue_ready is asserted, advance the issue pointer - // Issue scoreboard entry - ACC_DISPATCHER + // 
Entry about the instruction to issue - ISSUE_READ_OPERANDS output scoreboard_entry_t [CVA6Cfg.NrIssuePorts-1:0] issue_instr_o, - // TO_BE_COMPLETED - TO_BE_COMPLETED + // Instruction to issue - ISSUE_READ_OPERANDS output logic [CVA6Cfg.NrIssuePorts-1:0][31:0] orig_instr_o, - // TO_BE_COMPLETED - TO_BE_COMPLETED + // Is there an instruction to issue - ISSUE_READ_OPERANDS output logic [CVA6Cfg.NrIssuePorts-1:0] issue_instr_valid_o, - // TO_BE_COMPLETED - TO_BE_COMPLETED + // Issue stage acknowledge - ISSUE_READ_OPERANDS input logic [CVA6Cfg.NrIssuePorts-1:0] issue_ack_i, + // Forwarding - ISSUE_READ_OPERANDS + output forwarding_t fwd_o, - // TO_BE_COMPLETED - TO_BE_COMPLETED + // Result from branch unit - EX_STAGE input bp_resolve_t resolved_branch_i, - // Transaction ID at which to write the result back - TO_BE_COMPLETED + // Transaction ID at which to write the result back - EX_STAGE input logic [CVA6Cfg.NrWbPorts-1:0][CVA6Cfg.TRANS_ID_BITS-1:0] trans_id_i, - // Results to write back - TO_BE_COMPLETED + // Results to write back - EX_STAGE input logic [CVA6Cfg.NrWbPorts-1:0][CVA6Cfg.XLEN-1:0] wbdata_i, - // Exception from a functional unit (e.g.: ld/st exception) - TO_BE_COMPLETED + // Exception from a functional unit (e.g.: ld/st exception) - EX_STAGE input exception_t [CVA6Cfg.NrWbPorts-1:0] ex_i, - // Indicates valid results - TO_BE_COMPLETED + // Indicates valid results - EX_STAGE input logic [CVA6Cfg.NrWbPorts-1:0] wt_valid_i, - // Cvxif we for writeback - TO_BE_COMPLETED + // Cvxif we for writeback - EX_STAGE input logic x_we_i, + // CVXIF destination register - ISSUE_STAGE + input logic [4:0] x_rd_i, - // TO_BE_COMPLETED - RVFI + // Issue pointer - RVFI output logic [ CVA6Cfg.NrIssuePorts-1:0][CVA6Cfg.TRANS_ID_BITS-1:0] rvfi_issue_pointer_o, - // TO_BE_COMPLETED - RVFI + // Commit pointer - RVFI output logic [CVA6Cfg.NrCommitPorts-1:0][CVA6Cfg.TRANS_ID_BITS-1:0] rvfi_commit_pointer_o ); @@ -161,9 +148,8 @@ module scoreboard #( // an instruction is ready for 
issue if we have place in the issue FIFO and it the decoder says it is valid always_comb begin - decoded_instr_ack_o = '0; - issue_instr_o = decoded_instr_i; - orig_instr_o = orig_instr_i; + issue_instr_o = decoded_instr_i; + orig_instr_o = orig_instr_i; for (int unsigned i = 0; i < CVA6Cfg.NrIssuePorts; i++) begin // make sure we assign the correct trans ID issue_instr_o[i].trans_id = issue_pointer[i]; @@ -225,8 +211,9 @@ module scoreboard #( if (CVA6Cfg.DebugEn) begin mem_n[trans_id_i[i]].sbe.bp.predict_address = resolved_branch_i.target_address; end - if (mem_n[trans_id_i[i]].sbe.fu == ariane_pkg::CVXIF && ~x_we_i) begin - mem_n[trans_id_i[i]].sbe.rd = 5'b0; + if (mem_n[trans_id_i[i]].sbe.fu == ariane_pkg::CVXIF) begin + if (x_we_i) mem_n[trans_id_i[i]].sbe.rd = x_rd_i; + else mem_n[trans_id_i[i]].sbe.rd = 5'b0; end // write the exception back if it is valid if (ex_i[i].valid) mem_n[trans_id_i[i]].sbe.ex = ex_i[i]; @@ -242,10 +229,8 @@ module scoreboard #( // ------------ if (CVA6Cfg.SpeculativeSb) begin if (bmiss) begin - for (int unsigned i = 0; i < CVA6Cfg.NR_SB_ENTRIES; i++) begin - if (speculative_instrs[i]) begin - mem_n[i].cancelled = 1'b1; - end + if (after_flu_wb != issue_pointer[0]) begin + mem_n[after_flu_wb].cancelled = 1'b1; end end end @@ -280,16 +265,6 @@ module scoreboard #( assign bmiss = resolved_branch_i.valid && resolved_branch_i.is_mispredict; assign after_flu_wb = trans_id_i[ariane_pkg::FLU_WB] + 'd1; - if (CVA6Cfg.SpeculativeSb) begin : find_speculative_instrs - round_interval #( - .S(CVA6Cfg.TRANS_ID_BITS) - ) i_speculative_instrs ( - .start_i (after_flu_wb), - .stop_i (issue_pointer_q), - .active_o(speculative_instrs) - ); - end - // FIFO counter updates if (CVA6Cfg.NrCommitPorts == 2) begin : gen_commit_ports assign num_commit = commit_ack_i[1] + commit_ack_i[0]; @@ -309,194 +284,22 @@ module scoreboard #( assign commit_pointer_n[k] = (flush_i) ? 
'0 : commit_pointer_n[0] + unsigned'(k); end - // ------------------- - // RD clobber process - // ------------------- - // rd_clobber output: output currently clobbered destination registers - logic [2**ariane_pkg::REG_ADDR_SIZE-1:0][CVA6Cfg.NR_SB_ENTRIES:0] gpr_clobber_vld; - logic [2**ariane_pkg::REG_ADDR_SIZE-1:0][CVA6Cfg.NR_SB_ENTRIES:0] fpr_clobber_vld; - ariane_pkg::fu_t [ CVA6Cfg.NR_SB_ENTRIES:0] clobber_fu; - - always_comb begin : clobber_assign - gpr_clobber_vld = '0; - fpr_clobber_vld = '0; - - // default (highest entry hast lowest prio in arbiter tree below) - clobber_fu[CVA6Cfg.NR_SB_ENTRIES] = ariane_pkg::NONE; - for (int unsigned i = 0; i < 2 ** ariane_pkg::REG_ADDR_SIZE; i++) begin - gpr_clobber_vld[i][CVA6Cfg.NR_SB_ENTRIES] = 1'b1; - fpr_clobber_vld[i][CVA6Cfg.NR_SB_ENTRIES] = 1'b1; - end - - // check for all valid entries and set the clobber accordingly - for (int unsigned i = 0; i < CVA6Cfg.NR_SB_ENTRIES; i++) begin - gpr_clobber_vld[mem_q[i].sbe.rd][i] = still_issued[i] & ~mem_q[i].is_rd_fpr_flag; - fpr_clobber_vld[mem_q[i].sbe.rd][i] = still_issued[i] & mem_q[i].is_rd_fpr_flag; - clobber_fu[i] = mem_q[i].sbe.fu; - end - - // GPR[0] is always free - gpr_clobber_vld[0] = '0; - end - - for (genvar k = 0; k < 2 ** ariane_pkg::REG_ADDR_SIZE; k++) begin : gen_sel_clobbers - // get fu that is going to clobber this register (there should be only one) - rr_arb_tree #( - .NumIn(CVA6Cfg.NR_SB_ENTRIES + 1), - .DataType(ariane_pkg::fu_t), - .ExtPrio(1'b1), - .AxiVldRdy(1'b1) - ) i_sel_gpr_clobbers ( - .clk_i (clk_i), - .rst_ni (rst_ni), - .flush_i(1'b0), - .rr_i ('0), - .req_i (gpr_clobber_vld[k]), - .gnt_o (), - .data_i (clobber_fu), - .gnt_i (1'b1), - .req_o (), - .data_o (rd_clobber_gpr_o[k]), - .idx_o () - ); - if (CVA6Cfg.FpPresent) begin - rr_arb_tree #( - .NumIn(CVA6Cfg.NR_SB_ENTRIES + 1), - .DataType(ariane_pkg::fu_t), - .ExtPrio(1'b1), - .AxiVldRdy(1'b1) - ) i_sel_fpr_clobbers ( - .clk_i (clk_i), - .rst_ni (rst_ni), - .flush_i(1'b0), - .rr_i ('0), 
- .req_i (fpr_clobber_vld[k]), - .gnt_o (), - .data_i (clobber_fu), - .gnt_i (1'b1), - .req_o (), - .data_o (rd_clobber_fpr_o[k]), - .idx_o () - ); - end + // Forwarding logic + writeback_t [CVA6Cfg.NrWbPorts-1:0] wb; + for (genvar i = 0; i < CVA6Cfg.NrWbPorts; i++) begin + assign wb[i].valid = wt_valid_i[i]; + assign wb[i].data = wbdata_i[i]; + assign wb[i].ex_valid = ex_i[i].valid; + assign wb[i].trans_id = trans_id_i[i]; end - // ---------------------------------- - // Read Operands (a.k.a forwarding) - // ---------------------------------- - // read operand interface: same logic as register file - logic [CVA6Cfg.NrIssuePorts-1:0][CVA6Cfg.NR_SB_ENTRIES+CVA6Cfg.NrWbPorts-1:0] - rs1_fwd_req, rs2_fwd_req, rs3_fwd_req; - logic [CVA6Cfg.NrIssuePorts-1:0][CVA6Cfg.NR_SB_ENTRIES+CVA6Cfg.NrWbPorts-1:0][CVA6Cfg.XLEN-1:0] rs_data; - logic [CVA6Cfg.NrIssuePorts-1:0] rs1_valid, rs2_valid, rs3_valid; - - // WB ports have higher prio than entries - for (genvar i = 0; i < CVA6Cfg.NrIssuePorts; i++) begin - for (genvar k = 0; unsigned'(k) < CVA6Cfg.NrWbPorts; k++) begin : gen_rs_wb - assign rs1_fwd_req[i][k] = (mem_q[trans_id_i[k]].sbe.rd == rs1_i[i]) & (~mem_q[trans_id_i[k]].cancelled) & wt_valid_i[k] & (~ex_i[k].valid) & (mem_q[trans_id_i[k]].is_rd_fpr_flag == (CVA6Cfg.FpPresent && ariane_pkg::is_rs1_fpr( - issue_instr_o[i].op - ))); - assign rs2_fwd_req[i][k] = (mem_q[trans_id_i[k]].sbe.rd == rs2_i[i]) & (~mem_q[trans_id_i[k]].cancelled) & wt_valid_i[k] & (~ex_i[k].valid) & (mem_q[trans_id_i[k]].is_rd_fpr_flag == (CVA6Cfg.FpPresent && ariane_pkg::is_rs2_fpr( - issue_instr_o[i].op - ))); - assign rs3_fwd_req[i][k] = (mem_q[trans_id_i[k]].sbe.rd == rs3_i[i]) & (~mem_q[trans_id_i[k]].cancelled) & wt_valid_i[k] & (~ex_i[k].valid) & (mem_q[trans_id_i[k]].is_rd_fpr_flag == (CVA6Cfg.FpPresent && ariane_pkg::is_imm_fpr( - issue_instr_o[i].op - ))); - assign rs_data[i][k] = wbdata_i[k]; - end - for (genvar k = 0; unsigned'(k) < CVA6Cfg.NR_SB_ENTRIES; k++) begin : gen_rs_entries - 
assign rs1_fwd_req[i][k+CVA6Cfg.NrWbPorts] = (mem_q[k].sbe.rd == rs1_i[i]) & still_issued[k] & mem_q[k].sbe.valid & (mem_q[k].is_rd_fpr_flag == (CVA6Cfg.FpPresent && ariane_pkg::is_rs1_fpr( - issue_instr_o[i].op - ))); - assign rs2_fwd_req[i][k+CVA6Cfg.NrWbPorts] = (mem_q[k].sbe.rd == rs2_i[i]) & still_issued[k] & mem_q[k].sbe.valid & (mem_q[k].is_rd_fpr_flag == (CVA6Cfg.FpPresent && ariane_pkg::is_rs2_fpr( - issue_instr_o[i].op - ))); - assign rs3_fwd_req[i][k+CVA6Cfg.NrWbPorts] = (mem_q[k].sbe.rd == rs3_i[i]) & still_issued[k] & mem_q[k].sbe.valid & (mem_q[k].is_rd_fpr_flag == (CVA6Cfg.FpPresent && ariane_pkg::is_imm_fpr( - issue_instr_o[i].op - ))); - assign rs_data[i][k+CVA6Cfg.NrWbPorts] = mem_q[k].sbe.result; - end - - // check whether we are accessing GPR[0] - assign rs1_valid_o[i] = rs1_valid[i] & ((|rs1_i[i]) | (CVA6Cfg.FpPresent && ariane_pkg::is_rs1_fpr( - issue_instr_o[i].op - ))); - assign rs2_valid_o[i] = rs2_valid[i] & ((|rs2_i[i]) | (CVA6Cfg.FpPresent && ariane_pkg::is_rs2_fpr( - issue_instr_o[i].op - ))); - assign rs3_valid_o[i] = CVA6Cfg.NrRgprPorts == 3 ? 
rs3_valid[i] & ((|rs3_i[i]) | (CVA6Cfg.FpPresent && ariane_pkg::is_imm_fpr( - issue_instr_o[i].op - ))) : rs3_valid[i]; - - // use fixed prio here - // this implicitly gives higher prio to WB ports - rr_arb_tree #( - .NumIn(CVA6Cfg.NR_SB_ENTRIES + CVA6Cfg.NrWbPorts), - .DataWidth(CVA6Cfg.XLEN), - .ExtPrio(1'b1), - .AxiVldRdy(1'b1) - ) i_sel_rs1 ( - .clk_i (clk_i), - .rst_ni (rst_ni), - .flush_i(1'b0), - .rr_i ('0), - .req_i (rs1_fwd_req[i]), - .gnt_o (), - .data_i (rs_data[i]), - .gnt_i (1'b1), - .req_o (rs1_valid[i]), - .data_o (rs1_o[i]), - .idx_o () - ); - - rr_arb_tree #( - .NumIn(CVA6Cfg.NR_SB_ENTRIES + CVA6Cfg.NrWbPorts), - .DataWidth(CVA6Cfg.XLEN), - .ExtPrio(1'b1), - .AxiVldRdy(1'b1) - ) i_sel_rs2 ( - .clk_i (clk_i), - .rst_ni (rst_ni), - .flush_i(1'b0), - .rr_i ('0), - .req_i (rs2_fwd_req[i]), - .gnt_o (), - .data_i (rs_data[i]), - .gnt_i (1'b1), - .req_o (rs2_valid[i]), - .data_o (rs2_o[i]), - .idx_o () - ); - - logic [CVA6Cfg.NrIssuePorts-1:0][CVA6Cfg.XLEN-1:0] rs3; - - rr_arb_tree #( - .NumIn(CVA6Cfg.NR_SB_ENTRIES + CVA6Cfg.NrWbPorts), - .DataWidth(CVA6Cfg.XLEN), - .ExtPrio(1'b1), - .AxiVldRdy(1'b1) - ) i_sel_rs3 ( - .clk_i (clk_i), - .rst_ni (rst_ni), - .flush_i(1'b0), - .rr_i ('0), - .req_i (rs3_fwd_req[i]), - .gnt_o (), - .data_i (rs_data[i]), - .gnt_i (1'b1), - .req_o (rs3_valid[i]), - .data_o (rs3[i]), - .idx_o () - ); - - if (CVA6Cfg.NrRgprPorts == 3) begin : gen_gp_three_port - assign rs3_o[i] = rs3[i][riscv::XLEN-1:0]; - end else begin : gen_fp_three_port - assign rs3_o[i] = rs3[i][CVA6Cfg.FLen-1:0]; - end + assign fwd_o.still_issued = still_issued; + assign fwd_o.issue_pointer = issue_pointer; + assign fwd_o.wb = wb; + for (genvar i = 0; i < CVA6Cfg.NR_SB_ENTRIES; i++) begin + assign fwd_o.sbe[i] = mem_q[i].sbe; end - // sequential process always_ff @(posedge clk_i or negedge rst_ni) begin : regs if (!rst_ni) begin @@ -504,8 +307,9 @@ module scoreboard #( commit_pointer_q <= '0; issue_pointer_q <= '0; end else begin - issue_pointer_q <= 
issue_pointer_n; - mem_q <= mem_n; + issue_pointer_q <= issue_pointer_n; + mem_q <= mem_n; + mem_q[x_id_i].sbe.rd <= (x_transaction_accepted_i && ~x_issue_writeback_i) ? 5'b0 : mem_n[x_id_i].sbe.rd; commit_pointer_q <= commit_pointer_n; end end @@ -519,10 +323,6 @@ module scoreboard #( assert (CVA6Cfg.NR_SB_ENTRIES == 2 ** CVA6Cfg.TRANS_ID_BITS) else $fatal(1, "Scoreboard size needs to be a power of two."); end - - // assert that zero is never set - assert property (@(posedge clk_i) disable iff (!rst_ni) (rd_clobber_gpr_o[0] == ariane_pkg::NONE)) - else $fatal(1, "RD 0 should not bet set"); // assert that we never acknowledge a commit if the instruction is not valid assert property ( @(posedge clk_i) disable iff (!rst_ni) commit_ack_i[0] |-> commit_instr_o[0].valid) diff --git a/core/serdiv.sv b/core/serdiv.sv index 7e53ab608c..a9d3a1a30c 100644 --- a/core/serdiv.sv +++ b/core/serdiv.sv @@ -127,7 +127,7 @@ module serdiv assign shift_a = (lzc_a_no_one) ? WIDTH : {1'b0, lzc_a_result}; assign div_shift = {1'b0, lzc_b_result} - shift_a; - assign op_b = op_b_i <<< $unsigned(div_shift); + assign op_b = op_b_i <<< div_shift; // the division is zero if |opB| > |opA| and can be terminated assign div_res_zero_d = (load_en) ? 
div_shift[$high(div_shift)] : div_res_zero_q; diff --git a/core/store_unit.sv b/core/store_unit.sv index f6c373bc1b..d000623a25 100644 --- a/core/store_unit.sv +++ b/core/store_unit.sv @@ -99,10 +99,16 @@ module store_unit data_tmp[CVA6Cfg.XLEN-1:0] = {data[CVA6Cfg.XLEN-17:0], data[CVA6Cfg.XLEN-1:CVA6Cfg.XLEN-16]}; 3'b011: data_tmp[CVA6Cfg.XLEN-1:0] = {data[CVA6Cfg.XLEN-25:0], data[CVA6Cfg.XLEN-1:CVA6Cfg.XLEN-24]}; - 3'b100: data_tmp = {data[31:0], data[63:32]}; - 3'b101: data_tmp = {data[23:0], data[63:24]}; - 3'b110: data_tmp = {data[15:0], data[63:16]}; - 3'b111: data_tmp = {data[7:0], data[63:8]}; + default: + if (CVA6Cfg.IS_XLEN64) begin + case (addr_tmp) + 3'b100: data_tmp = {data[31:0], data[63:32]}; + 3'b101: data_tmp = {data[23:0], data[63:24]}; + 3'b110: data_tmp = {data[15:0], data[63:16]}; + 3'b111: data_tmp = {data[7:0], data[63:8]}; + default: data_tmp = {data[63:0]}; + endcase + end endcase return data_tmp[CVA6Cfg.XLEN-1:0]; endfunction @@ -273,8 +279,8 @@ module store_unit logic store_buffer_ready, amo_buffer_ready; // multiplex between store unit and amo buffer - assign store_buffer_valid = st_valid & (amo_op_q == AMO_NONE); - assign amo_buffer_valid = st_valid & (amo_op_q != AMO_NONE); + assign store_buffer_valid = st_valid & (!CVA6Cfg.RVA || (amo_op_q == AMO_NONE)); + assign amo_buffer_valid = st_valid & (CVA6Cfg.RVA && (amo_op_q != AMO_NONE)); assign st_ready = store_buffer_ready & amo_buffer_ready; diff --git a/corev_apu/src/ariane.sv b/corev_apu/src/ariane.sv index 38b3281bb9..866c6d6369 100644 --- a/corev_apu/src/ariane.sv +++ b/corev_apu/src/ariane.sv @@ -12,6 +12,7 @@ // Date: 19.03.2017 // Description: Ariane Top-level module +`include "cvxif_types.svh" module ariane import ariane_pkg::*; #( parameter config_pkg::cva6_cfg_t CVA6Cfg = config_pkg::cva6_cfg_empty, @@ -21,6 +22,21 @@ module ariane import ariane_pkg::*; #( logic csr; logic instr; }, + // CVXIF Types + localparam type readregflags_t = `READREGFLAGS_T(CVA6Cfg), + localparam type 
writeregflags_t = `WRITEREGFLAGS_T(CVA6Cfg), + localparam type id_t = `ID_T(CVA6Cfg), + localparam type hartid_t = `HARTID_T(CVA6Cfg), + localparam type x_compressed_req_t = `X_COMPRESSED_REQ_T(CVA6Cfg, hartid_t), + localparam type x_compressed_resp_t = `X_COMPRESSED_RESP_T(CVA6Cfg), + localparam type x_issue_req_t = `X_ISSUE_REQ_T(CVA6Cfg, hartit_t, id_t), + localparam type x_issue_resp_t = `X_ISSUE_RESP_T(CVA6Cfg, writeregflags_t, readregflags_t), + localparam type x_register_t = `X_REGISTER_T(CVA6Cfg, hartid_t, id_t, readregflags_t), + localparam type x_commit_t = `X_COMMIT_T(CVA6Cfg, hartid_t, id_t), + localparam type x_result_t = `X_RESULT_T(CVA6Cfg, hartid_t, id_t, writeregflags_t), + localparam type cvxif_req_t = `CVXIF_REQ_T(CVA6Cfg, x_compressed_req_t, x_issue_req_t, x_register_req_t, x_commit_t), + localparam type cvxif_resp_t = `CVXIF_RESP_T(CVA6Cfg, x_compressed_resp_t, x_issue_resp_t, x_result_t), + // AXI Types parameter int unsigned AxiAddrWidth = ariane_axi::AddrWidth, parameter int unsigned AxiDataWidth = ariane_axi::DataWidth, parameter int unsigned AxiIdWidth = ariane_axi::IdWidth, @@ -50,8 +66,8 @@ module ariane import ariane_pkg::*; #( input noc_resp_t noc_resp_i ); - cvxif_pkg::cvxif_req_t cvxif_req; - cvxif_pkg::cvxif_resp_t cvxif_resp; + cvxif_req_t cvxif_req; + cvxif_resp_t cvxif_resp; cva6 #( .CVA6Cfg ( CVA6Cfg ), @@ -62,7 +78,20 @@ module ariane import ariane_pkg::*; #( .axi_aw_chan_t (axi_aw_chan_t), .axi_w_chan_t (axi_w_chan_t), .noc_req_t (noc_req_t), - .noc_resp_t (noc_resp_t) + .noc_resp_t (noc_resp_t), + .readregflags_t (readregflags_t), + .writeregflags_t (writeregflags_t), + .id_t (id_t), + .hartid_t (hartid_t), + .x_compressed_req_t (x_compressed_req_t), + .x_compressed_resp_t (x_compressed_resp_t), + .x_issue_req_t (x_issue_req_t), + .x_issue_resp_t (x_issue_resp_t), + .x_register_t (x_register_t), + .x_commit_t (x_commit_t), + .x_result_t (x_result_t), + .cvxif_req_t (cvxif_req_t), + .cvxif_resp_t (cvxif_resp_t) ) i_cva6 ( 
.clk_i ( clk_i ), .rst_ni ( rst_ni ), @@ -81,13 +110,36 @@ module ariane import ariane_pkg::*; #( if (CVA6Cfg.CvxifEn) begin : gen_example_coprocessor cvxif_example_coprocessor #( - .CVA6Cfg ( CVA6Cfg ) + .NrRgprPorts (CVA6Cfg.NrRgprPorts), + .XLEN (CVA6Cfg.XLEN), + .readregflags_t (readregflags_t), + .writeregflags_t (writeregflags_t), + .id_t (id_t), + .hartid_t (hartid_t), + .x_compressed_req_t (x_compressed_req_t), + .x_compressed_resp_t (x_compressed_resp_t), + .x_issue_req_t (x_issue_req_t), + .x_issue_resp_t (x_issue_resp_t), + .x_register_t (x_register_t), + .x_commit_t (x_commit_t), + .x_result_t (x_result_t), + .cvxif_req_t (cvxif_req_t), + .cvxif_resp_t (cvxif_resp_t) ) i_cvxif_coprocessor ( .clk_i ( clk_i ), .rst_ni ( rst_ni ), .cvxif_req_i ( cvxif_req ), .cvxif_resp_o ( cvxif_resp ) ); + end else begin + always_comb begin + cvxif_resp = '0; + cvxif_resp.compressed_ready = 1'b1; + cvxif_resp.issue_ready = 1'b1; + cvxif_resp.register_ready = 1'b1; + end end + + endmodule // ariane diff --git a/corev_apu/tb/ariane_tb.sv b/corev_apu/tb/ariane_tb.sv index 18cedeefe3..22e3276512 100644 --- a/corev_apu/tb/ariane_tb.sv +++ b/corev_apu/tb/ariane_tb.sv @@ -54,7 +54,7 @@ module ariane_tb; // toggle with RTC period localparam int unsigned RTC_CLOCK_PERIOD = 30.517us; - localparam NUM_WORDS = 2**16; + localparam NUM_WORDS = 2**18; logic clk_i; logic rst_ni; logic rtc_i; diff --git a/corev_apu/tb/common/mock_uart.sv b/corev_apu/tb/common/mock_uart.sv index 6a14904b11..0124ea714e 100644 --- a/corev_apu/tb/common/mock_uart.sv +++ b/corev_apu/tb/common/mock_uart.sv @@ -40,15 +40,15 @@ module mock_uart ( localparam THRE = 5; // transmit holding register empty localparam TEMT = 6; // transmit holding register empty - byte lcr = 0; - byte dlm = 0; - byte dll = 0; - byte mcr = 0; - byte lsr = 0; - byte ier = 0; - byte msr = 0; - byte scr = 0; - logic fifo_enabled = 1'b0; + byte lcr; + byte dlm; + byte dll; + byte mcr; + byte lsr; + byte ier; + byte msr; + byte scr; + logic 
fifo_enabled; assign pready_o = 1'b1; assign pslverr_o = 1'b0; @@ -62,7 +62,17 @@ module mock_uart ( /* verilator lint_off WIDTHCONCAT */ always_ff @(posedge clk_i or negedge rst_ni) begin - if (rst_ni) begin + if (!rst_ni) begin + lcr <= 0; + dlm <= 0; + dll <= 0; + mcr <= 0; + lsr <= 0; + ier <= 0; + msr <= 0; + scr <= 0; + fifo_enabled <= 1'b0; + end else begin if (psel_i & penable_i & pwrite_i) begin case ((paddr_i >> 'h2) & 'h7) THR: begin diff --git a/corev_apu/tb/common/spike.sv b/corev_apu/tb/common/spike.sv index 3250ce9ec4..1d1fb9ceda 100644 --- a/corev_apu/tb/common/spike.sv +++ b/corev_apu/tb/common/spike.sv @@ -49,15 +49,16 @@ module spike #( st_core_cntrl_cfg st; bit sim_finished; + string core_name = "cva6"; initial begin - string core_name = "cva6"; st = cva6pkg_to_core_cntrl_cfg(st); st.boot_addr_valid = 1'b1; st.boot_addr = 64'h0x10000; if ($test$plusargs("core_name")) begin $value$plusargs("core_name=%s", core_name); + `uvm_info("SPIKE", $sformatf("### core_name = '%s'", core_name), UVM_LOW); end rvfi_initialize(st); diff --git a/corev_apu/tb/rvfi_tracer.sv b/corev_apu/tb/rvfi_tracer.sv index 57dec61de1..b48a4836db 100644 --- a/corev_apu/tb/rvfi_tracer.sv +++ b/corev_apu/tb/rvfi_tracer.sv @@ -113,6 +113,7 @@ module rvfi_tracer #( rvfi_i[i].mem_paddr == TOHOST_ADDR && rvfi_i[i].mem_wdata[0] == 1'b1) begin end_of_test_q <= rvfi_i[i].mem_wdata[31:0]; + $display("*** [rvfi_tracer] INFO: Simulation terminated after %d cycles!\n", cycles); end end end diff --git a/corev_apu/tb/tb_cva6_icache/hdl/tlb_emul.sv b/corev_apu/tb/tb_cva6_icache/hdl/tlb_emul.sv index 96bd905c32..a37a4d1c28 100644 --- a/corev_apu/tb/tb_cva6_icache/hdl/tlb_emul.sv +++ b/corev_apu/tb/tb_cva6_icache/hdl/tlb_emul.sv @@ -36,7 +36,7 @@ always_ff @(posedge clk_i or negedge rst_ni) begin : p_tlb_rand automatic int rnd = 0; assert(TlbRandHitRate<=100 && TlbRandHitRate>=0) else - $fatal("TlbRandHitRate must be a percentage"); + $fatal(1, "TlbRandHitRate must be a percentage"); if(~rst_ni) 
begin tlb_ready_q <= '0; diff --git a/docs/.gitignore b/docs/.gitignore index f7dfd9a8a7..b672d6394e 100644 --- a/docs/.gitignore +++ b/docs/.gitignore @@ -1,3 +1,4 @@ *~ /_build *.swp +*.html diff --git a/docs/01_cva6_user/AXI_Interface.rst b/docs/01_cva6_user/AXI_Interface.rst index 35af726c6c..09c8f51a30 100644 --- a/docs/01_cva6_user/AXI_Interface.rst +++ b/docs/01_cva6_user/AXI_Interface.rst @@ -95,7 +95,7 @@ Table 2.1 shows the global AXI memory interface signals. - Clock source - | Global clock signal. Synchronous signals are sampled on the | rising edge of the global clock. - * - **WDATA** + * - **ARESETn** - Reset source - | Global reset signal. This signal is active-LOW. diff --git a/docs/01_cva6_user/CVX_Interface_Coprocessor.rst b/docs/01_cva6_user/CVX_Interface_Coprocessor.rst index 2a33f34b67..ede928b039 100644 --- a/docs/01_cva6_user/CVX_Interface_Coprocessor.rst +++ b/docs/01_cva6_user/CVX_Interface_Coprocessor.rst @@ -1,4 +1,4 @@ -.. +.. Copyright (c) 2023 OpenHW Group Copyright (c) 2023 Thales @@ -198,8 +198,229 @@ Never let the CV-X-IF interface unconnected with the ``CVA6ConfigCvxifEn`` param How to design a coprocessor for the CV-X-IF interface ----------------------------------------------------- -*The team is looking for a contributor to write this section.* + +We can add a custom coprocessor that implements custom instructions by modifying the example coprocessor in this repository. +This section is structured as a tutorial to implement two instructions that manipulate binary-coded decimal numbers. +That is, numbers where each 4-bit nibble represents a single base-10 digit with the value 0-9. +For example, 123 in decimal = 0x7B in hexadecimal = 0x123 in binary-coded decimal. + +#. Specify your new instructions + + The example coprocessor defines instructions for both the custom 0 and custom 1 major opcodes. 
Using a standard R-type format, each of these allows 1024 distinct instructions to be defined using the 7-bit funct7 field and the 3-bit funct3 field. + + .. image:: rtype_format.png + :width: 400 + :alt: Rtype RISC-V instruction format + + Example: + + .. code-block:: + + opcode=custom1, funct7=0x00, funct3=0x00: BCDfromBin + rf[rd] <- BCD(rf[rs1]) + Register rd is written with the binary-coded decimal equivalent of the binary integer value in rs1. + Note: rs2 is not used. + + opcode=custom1, funct7=0x00, funct3=0x01: BCDADD + rf[rd] <- ADD.BCD(rf[rs1], rf[rs2]) + Register rd is written with a binary-coded decimal (BCD) sum of BCD integers in registers rs1 and rs2. + + Note: The existing CVA6 example supports only register-to-register instructions with up to three + source registers and a single destination register. New memory operations will need substantial modifications + to the coprocessor and CVA6 system-on-chip. + +#. Branch CVA6 repo + + .. code-block:: + + git branch new_coprocessor + git checkout new_coprocessor + +#. Specialise the decoder function in core/cvxif_example/include/cvxif_instr_pkg.sv + + Example new lines in cvxif_instr_pkg: + + At the top, specify opcodes for our new instructions: + + .. code-block:: + + typedef enum logic [3:0] { + ILLEGAL = 4'b0000, // This one is mandatory, as we need a fall-through case that = 0. + BCDfromBIN = 4'b0001, + BCDADD = 4'b0010 + } opcode_t; + + Now define decode behavior for our two new instructions: + + .. 
code-block:: + + // 2 new RISCV instructions for our Coprocessor + parameter int unsigned NbInstr = 2; + parameter copro_issue_resp_t CoproInstr[NbInstr] = '{ + '{ + // Custom BCDfromBIN : BCDfromBIN rd, rs1 + instr:32'b0000000_00000_00000_000_00000_0101011, // custom1 opcode + mask: 32'b1111111_00000_00000_111_00000_1111111, + resp : '{accept : 1'b1, writeback : 1'b1, // This instruction will write a register + register_read : {1'b0, 1'b0, 1'b1}}, // Use rs1 for input + opcode : BCDfromBIN + }, + '{ + // Custom BCDADD : BCDADD rd, rs1, rs2 + instr:32'b0000000_00000_00000_001_00000_0101011, // custom1 opcode + mask: 32'b1111111_00000_00000_111_00000_1111111, + resp : '{accept : 1'b1, writeback : 1'b1, // This instruction will write a register + register_read : {1'b0, 1'b1, 1'b1}}, // Use rs1 and rs2 for input + opcode : BCDADD + } + }; + + We should also introduce a null compressed instruction, as we have not specified one. + + .. code-block:: + + // No compressed instructions for our Coprocessor, but must have a NULL entry. + parameter int unsigned NbCompInstr = 1; + parameter copro_compressed_resp_t CoproCompInstr[NbCompInstr] = '{ + // NULL Pattern + '{ + instr : 16'b0000_0000_0000_0000, + mask : 16'b0000_0000_0000_0000, + resp : '{accept : 1'b0, // Do not accept! + instr : 32'b0000_0000_0000_0000_0000_0000_0000_0000} + } + }; + +4. Write execution logic in core/cvxif_example/cppro_alu.sv + + Example new lines in cppro_alu.sv: + + .. 
code-block:: + + localparam W = X_RFR_WIDTH; + function automatic logic [W-1:0] BCDfromBin (logic [W-1:0] bin); + // Code adapted from https://en.wikipedia.org/wiki/Double_dabble + logic [W+(W-4)/3:0] bcd = 0; // initialize with zeros + bcd[W-1:0] = bin; // initialize with input vector + for(int i = 0; i <= W-4; i = i+1) // iterate on structure depth + for(int j = 0; j <= i/3; j = j+1) // iterate on structure width + if (bcd[W-i+4*j -: 4] > 4) // if > 4 + bcd[W-i+4*j -: 4] = bcd[W-i+4*j -: 4] + 4'd3; // add 3 + return bcd[W-1:0]; + endfunction + function automatic logic [W-1:0] BCDADD (logic [W-1:0] x, logic [W-1:0] y); + logic [W-1:0] sum; // full sum result + logic [4:0] tmp = 0; // temporary digit result (could be up to 9+9+8=24) + logic [3:0] c = 0; // carry bits + for(int i = 3; i= 10) begin // Add one to carry for each "10" in temp. + c += 1; + tmp = tmp - 10; // Leave tmp less than 10. + end + sum[i-:4] = tmp[3:0] ; + end + return sum; + endfunction + + In final always_comb block of cppro_alu.sv, modify the case statement: + + .. code-block:: + + case (opcode_i) + cvxif_instr_pkg::BCDfromBIN: begin + result_n = BCDfromBin(registers_i[0]); + hartid_n = hartid_i; + id_n = id_i; + valid_n = 1'b1; + rd_n = rd_i; + we_n = 1'b1; + end + cvxif_instr_pkg::BCDADD: begin + result_n = BCDADD(registers_i[0], registers_i[1]); + hartid_n = hartid_i; + id_n = id_i; + valid_n = 1'b1; + rd_n = rd_i; + we_n = 1'b1; + end + default: begin + ... + + Note: To support new memory operations, the memory interface would be needed + in this coprocessor to load and store from the main pipeline. Alternatively, + one could add a dedicated memory interface to the coprocessor, though care would + need to be taken for memory coherence and consistency with the data cache. + +5. Write a simple test + + For example, add the following to verif/tests/custom/cv_xif/cvxif_macros.h: + + .. 
code-block:: + + #define CUS_BCDfromBin(rs1,rd) .word 0b####000000000000##rs1##000##rd##0101011 + #define CUS_BCDADD(rs1,rs2,rd) .word 0b####0000000##rs2####rs1##001##rd##0101011 + + Copy similar test: + + .. code-block:: + + cp verif/tests/custom/cv_xif/cvxif_add_nop.S verif/tests/custom/cv_xif/cvxif_bcd.S + + Change the body of the test: + + .. code-block:: + + // core of the test + + // Load constant values into a0 and a1 + LOAD_RS(a0, 12345678); + LOAD_RS(a1, 23456789); + + // Transform a0 and a1 into BCD form + CUS_BCDfromBin(01010,01010); // a0 = 5'b01010 + CUS_BCDfromBin(01011,01011); // a1 = 5'b01011 + + // Perform BCD add on the operands into a2 and a3 + CUS_BCDADD(01010,01011,01100); + CUS_BCDADD(01011,01010,01101); + + // (example of) final self-check test + xor a2, a3, a2; + beqz a2, pass; + +6. Now build a simulation and run it + + Example: + + .. code-block:: + + cd ~/cva6/verif/sim + export DV_SIMULATORS=veri-testharness + TRACE_FAST=1 python3 cva6.py --target cv64a6_imafdc_sv39 \ + --iss=$DV_SIMULATORS --iss_yaml=cva6.yaml \ + --asm_tests ../tests/custom/cv_xif/cvxif_bcd.S \ + --linker=../tests/custom/common/test.ld \ + --gcc_opts="-static -mcmodel=medany \ + -fvisibility=hidden -nostdlib \ + -nostartfiles -g ../tests/custom/common/syscalls.c \ + ../tests/custom/common/crt.S -lgcc \ + -I../tests/custom/env -I../tests/custom/common" + + Check verilog build errors in verif/sim/out_*/veri-testharness_sim/cvxif_bcd.cv64a6_imafdc_sv39.log.iss. + + Check instruction trace of the execution in verif/sim/out_*/veri-testharness_sim/cvxif_bcd.cv64a6_imafdc_sv39.log. + + View the simulated waveform output using: + + .. code-block:: + + gtkwave verif/sim/out_*/veri-testharness_sim/cvxif_bcd.cv64a6_imafdc_sv39.vcd + + The signals in TOP.ariane_testharness.i_ariane.cvxif_req/resp should be useful. 
How to program a CV-X-IF coprocessor ------------------------------------- -*The team is looking for a contributor to write this section.* +------------------------------------ +*The team is looking for a contributor to write this section.* diff --git a/docs/01_cva6_user/Parameters_Configuration.rst b/docs/01_cva6_user/Parameters_Configuration.rst index fc8a001127..b3117a6b8a 100644 --- a/docs/01_cva6_user/Parameters_Configuration.rst +++ b/docs/01_cva6_user/Parameters_Configuration.rst @@ -27,58 +27,15 @@ Main contributor: Jean-Roch Coulon - Thales Parameters ---------- -.. csv-table:: - :widths: auto - :align: left - :header: "Parameter", "Category", "Description" - - "``Cva6MArchID``", "Archi", "Cva6 architecture ID" - "``Xlen``", "Variant", "Data length" - "``CExtEn``", "Variant", "C extension enable" - "``AExtEn``", "Variant", "A extension enable" - "``BMExtEn``", "Variant", "Bit Manipulation extension enable" - "``FpuEn``", "Variant", "FPU enable" - "``F16En``", "Variant", "FPU 16bits enable" - "``F16AltEn``", "Variant", "FPU Alt 16bits enable" - "``F8En``", "Variant", "FPU 8bits enable" - "``FVecEn``", "Variant", "Vector FPU enable" - "``MMUEn``", "Memory", "MMU Present" - "``InstrTlbEntries``", "Memory", "Instruction TLB entry number" - "``DataTlbEntries``", "Memory", "Data TLB entry number" - "``RASDepth``", "Memory", "Depth of Return Address Stack" - "``BTBEntries``", "Memory", "BTB entry number" - "``BHTEntries``", "Memory", "BHT entry number" - "``NrNonIdempotentRules``", "Memory", "Number of non idempotent region" - "``NonIdempotentAddrBase``", "Memory", "Base address of non idempotent region" - "``NonIdempotentLength``", "Memory", "Length of non idempotent region" - "``NrExecuteRegionRules``", "Memory", "Number of excution regions" - "``ExecuteRegionAddrBase``", "Memory", "Execution region of base address (DRAM, Boot ROM and Debug Module)" - "``ExecuteRegionLength``", "Memory", "Length of execution region" - "``NrCachedRegionRules``", "Memory", 
"Number of cached region" - "``CachedRegionAddrBase``", "Memory", "Base address of cached region" - "``CachedRegionLength``", "Memory", "Length of cached regions" - "``NrPMPEntries``", "Memory", "Number of PMP entries" - "``DmBaseAddress``", "Debug", "Base address of debug" - "``CvxifEn``", "Ports", "CV-X-IF interface enable" - "``RVFI_TRACE (define)``", "Ports", "RVFI interface enable" - "``FIRESIM_TRACE (define)``", "Ports", "FIRESIM interface enable" - "``PITON_ARIANE (define)``", "Ports", "Piton interface enable, and AXI interface disable" - "``WT_CACHE (define)``", "Caches", "Write through cache enable, write back cache disable" - "``DepthStoreBuffer``", "Caches", "Depth of store buffer" - "``IcacheSetAssoc``", "Caches", "Instruction cache way number" - "``DcacheSetAssoc``", "Caches", "Data cache way number" - "``NrLoadPipeRegs``", "Caches", "Number of stall on load operation" - "``NrStorePipeRegs``", "Caches", "Number of stall on store operation" - "``AxiCompliant``", "Caches", "Cache configuration: AXI or XXXX" - "``SwapEndianess``", "Caches", "Endianess of cache: XXXX" - "``FetchUserEn``", "Users", "Fetch AXI user bit enable" - "``FetchUserWidth``", "Users", "Fetch user bit number when enabled" - "``DataUserEn``", "Users", "Data AXI user bit enable" - "``DataUserWidth``", "Users", "Data user bit number when enabled" - "``RenameEn``", "Pipeline", "Register renaming feature enable" - "``NrCommitPorts``", "Pipeline", "Commit port number" - "``NrScoreboardEntries``", "Pipeline", "Scoreboard entry number" - "``FpgaEn``", "Technology", "FPGA optimization enable" +.. include:: user_cfg_doc.rst + +\*: Some parameters are incompatible with others: + +- ``SuperscalarEn``: + + - Not compatible with floating point (``RVF``, ``RVD``, ``XF16``, ``XF16ALT``, ``XF8``, ``XFVec``) yet. + - Not compatible with macro instructions (``RVZCMP``) yet. + - Recommended to set ``NrScoreboardEntries`` to at least 8 for performance. 
Configurations diff --git a/docs/01_cva6_user/rtype_format.png b/docs/01_cva6_user/rtype_format.png new file mode 100644 index 0000000000..cb0f1f2ee2 Binary files /dev/null and b/docs/01_cva6_user/rtype_format.png differ diff --git a/docs/01_cva6_user/user_cfg_doc.rst b/docs/01_cva6_user/user_cfg_doc.rst new file mode 100644 index 0000000000..d0e52b1a1b --- /dev/null +++ b/docs/01_cva6_user/user_cfg_doc.rst @@ -0,0 +1,321 @@ +.. + Copyright 2024 Thales DIS France SAS + Licensed under the Solderpad Hardware License, Version 2.1 (the "License"); + you may not use this file except in compliance with the License. + SPDX-License-Identifier: Apache-2.0 WITH SHL-2.1 + You may obtain a copy of the License at https://solderpad.org/licenses/ + + Original Author: Jean-Roch COULON - Thales + +.. _cva6_user_cfg_doc: + +.. list-table:: ``cva6_user_cfg_t`` parameters + :header-rows: 1 + + * - Name + - Type + - Description + + * - ``XLEN`` + - ``int unsigned`` + - General Purpose Register Size (in bits) + + * - ``RVA`` + - ``bit`` + - Atomic RISC-V extension + + * - ``RVB`` + - ``bit`` + - Bit manipulation RISC-V extension + + * - ``RVV`` + - ``bit`` + - Vector RISC-V extension + + * - ``RVC`` + - ``bit`` + - Compress RISC-V extension + + * - ``RVH`` + - ``bit`` + - Hypervisor RISC-V extension + + * - ``RVZCB`` + - ``bit`` + - Zcb RISC-V extension + + * - ``RVZCMP`` + - ``bit`` + - Zcmp RISC-V extension + + * - ``RVZiCond`` + - ``bit`` + - Zicond RISC-V extension + + * - ``RVZicntr`` + - ``bit`` + - Zicntr RISC-V extension + + * - ``RVZihpm`` + - ``bit`` + - Zihpm RISC-V extension + + * - ``RVF`` + - ``bit`` + - Floating Point + + * - ``RVD`` + - ``bit`` + - Floating Point + + * - ``XF16`` + - ``bit`` + - Non standard 16bits Floating Point extension + + * - ``XF16ALT`` + - ``bit`` + - Non standard 16bits Floating Point Alt extension + + * - ``XF8`` + - ``bit`` + - Non standard 8bits Floating Point extension + + * - ``XFVec`` + - ``bit`` + - Non standard Vector Floating Point 
extension + + * - ``PerfCounterEn`` + - ``bit`` + - Perf counters + + * - ``MmuPresent`` + - ``bit`` + - MMU + + * - ``RVS`` + - ``bit`` + - Supervisor mode + + * - ``RVU`` + - ``bit`` + - User mode + + * - ``DebugEn`` + - ``bit`` + - Debug support + + * - ``DmBaseAddress`` + - ``logic [63:0]`` + - Base address of the debug module + + * - ``HaltAddress`` + - ``logic [63:0]`` + - Address to jump when halt request + + * - ``ExceptionAddress`` + - ``logic [63:0]`` + - Address to jump when exception + + * - ``TvalEn`` + - ``bit`` + - Tval Support Enable + + * - ``DirectVecOnly`` + - ``bit`` + - MTVEC CSR supports only direct mode + + * - ``NrPMPEntries`` + - ``int unsigned`` + - PMP entries number + + * - ``PMPCfgRstVal`` + - ``logic [63:0][63:0]`` + - PMP CSR configuration reset values + + * - ``PMPAddrRstVal`` + - ``logic [63:0][63:0]`` + - PMP CSR address reset values + + * - ``PMPEntryReadOnly`` + - ``bit [63:0]`` + - PMP CSR read-only bits + + * - ``NrNonIdempotentRules`` + - ``int unsigned`` + - PMA non idempotent rules number + + * - ``NonIdempotentAddrBase`` + - ``logic [NrMaxRules-1:0][63:0]`` + - PMA NonIdempotent region base address + + * - ``NonIdempotentLength`` + - ``logic [NrMaxRules-1:0][63:0]`` + - PMA NonIdempotent region length + + * - ``NrExecuteRegionRules`` + - ``int unsigned`` + - PMA regions with execute rules number + + * - ``ExecuteRegionAddrBase`` + - ``logic [NrMaxRules-1:0][63:0]`` + - PMA Execute region base address + + * - ``ExecuteRegionLength`` + - ``logic [NrMaxRules-1:0][63:0]`` + - PMA Execute region address base + + * - ``NrCachedRegionRules`` + - ``int unsigned`` + - PMA regions with cache rules number + + * - ``CachedRegionAddrBase`` + - ``logic [NrMaxRules-1:0][63:0]`` + - PMA cache region base address + + * - ``CachedRegionLength`` + - ``logic [NrMaxRules-1:0][63:0]`` + - PMA cache region rules + + * - ``CvxifEn`` + - ``bit`` + - CV-X-IF coprocessor interface enable + + * - ``NOCType`` + - ``noc_type_e`` + - NOC bus type + + * - 
``AxiAddrWidth`` + - ``int unsigned`` + - AXI address width + + * - ``AxiDataWidth`` + - ``int unsigned`` + - AXI data width + + * - ``AxiIdWidth`` + - ``int unsigned`` + - AXI ID width + + * - ``AxiUserWidth`` + - ``int unsigned`` + - AXI User width + + * - ``AxiBurstWriteEn`` + - ``bit`` + - AXI burst in write + + * - ``MemTidWidth`` + - ``int unsigned`` + - TODO + + * - ``IcacheByteSize`` + - ``int unsigned`` + - Instruction cache size (in bytes) + + * - ``IcacheSetAssoc`` + - ``int unsigned`` + - Instruction cache associativity (number of ways) + + * - ``IcacheLineWidth`` + - ``int unsigned`` + - Instruction cache line width + + * - ``DCacheType`` + - ``cache_type_t`` + - Cache Type + + * - ``DcacheIdWidth`` + - ``int unsigned`` + - Data cache ID + + * - ``DcacheByteSize`` + - ``int unsigned`` + - Data cache size (in bytes) + + * - ``DcacheSetAssoc`` + - ``int unsigned`` + - Data cache associativity (number of ways) + + * - ``DcacheLineWidth`` + - ``int unsigned`` + - Data cache line width + + * - ``DataUserEn`` + - ``int unsigned`` + - User field on data bus enable + + * - ``WtDcacheWbufDepth`` + - ``int unsigned`` + - Write-through data cache write buffer depth + + * - ``FetchUserEn`` + - ``int unsigned`` + - User field on fetch bus enable + + * - ``FetchUserWidth`` + - ``int unsigned`` + - Width of fetch user field + + * - ``FpgaEn`` + - ``bit`` + - Is FPGA optimization of CV32A6 + + * - ``TechnoCut`` + - ``bit`` + - Is Techno Cut instanciated + + * - ``SuperscalarEn`` + - ``bit`` + - Enable superscalar* with 2 issue ports and 2 commit ports. + + * - ``NrCommitPorts`` + - ``int unsigned`` + - Number of commit ports. Forced to 2 if SuperscalarEn. 
+ + * - ``NrLoadPipeRegs`` + - ``int unsigned`` + - Load cycle latency number + + * - ``NrStorePipeRegs`` + - ``int unsigned`` + - Store cycle latency number + + * - ``NrScoreboardEntries`` + - ``int unsigned`` + - Scoreboard length + + * - ``NrLoadBufEntries`` + - ``int unsigned`` + - Load buffer entry buffer + + * - ``MaxOutstandingStores`` + - ``int unsigned`` + - Maximum number of outstanding stores + + * - ``RASDepth`` + - ``int unsigned`` + - Return address stack depth + + * - ``BTBEntries`` + - ``int unsigned`` + - Branch target buffer entries + + * - ``BHTEntries`` + - ``int unsigned`` + - Branch history entries + + * - ``InstrTlbEntries`` + - ``int unsigned`` + - MMU instruction TLB entries + + * - ``DataTlbEntries`` + - ``int unsigned`` + - MMU data TLB entries + + * - ``UseSharedTlb`` + - ``bit unsigned`` + - MMU option to use shared TLB + + * - ``SharedTlbDepth`` + - ``int unsigned`` + - MMU depth of shared TLB diff --git a/docs/02_cva6_requirements/cva6_requirements_specification.rst b/docs/02_cva6_requirements/cva6_requirements_specification.rst index d0170bb583..da5605efbb 100644 --- a/docs/02_cva6_requirements/cva6_requirements_specification.rst +++ b/docs/02_cva6_requirements/cva6_requirements_specification.rst @@ -204,8 +204,7 @@ https://github.com/riscv-non-isa/riscv-arch-test. [AXI] AXI Specification, https://developer.arm.com/documentation/ihi0022/hc. -[CV-X-IF] Placeholder for the CV-X-IF coprocessor interface currently -prepared at OpenHW Group; current version in +[CV-X-IF] “OpenHW Group Specification: Core-V eXtension interface (CV-X-IF)”, version 1.0.0, https://docs.openhwgroup.org/projects/openhw-group-core-v-xif/. [OpenPiton] “OpenPiton Microarchitecture Specification”, Princeton @@ -304,9 +303,11 @@ independent requirements. | | 2.0. 
| +-----------------------------------+-----------------------------------+ | ISA-120 | CVA6 should support as an | -| | **option** the **Zba**, **Zbb**, | -| | **Zbc** and **Zbs** extensions | +| | **option** the **B** extension | | | (bit manipulation), version 1.0. | +| | The **B** extension comprises the | +| | **Zba**, **Zbb**, **Zbc** | +| | and **Zbs** extensions. | +-----------------------------------+-----------------------------------+ | ISA-130 | CVA6 should support as an | | | **option** the **Zicond** | @@ -614,15 +615,15 @@ work. If a RISC-V specification is ratified, the CVA6 specification will likely switch to it. +-----------------------------------+-----------------------------------+ -| FET‑10 | CVA6 shall support the | +| FET‑10 | CVA6 should support the | | | ``FENCE.T`` instruction that | | | ensures that the execution time | | | of subsequent instructions is | | | unrelated with predecessor | | | instructions. | +-----------------------------------+-----------------------------------+ -| FET‑20 | ``FENCE.T`` shall be available in | -| | all privilege modes (machine, | +| FET‑20 | ``FENCE.T`` should be available | +| | in all privilege modes (machine, | | | supervisor, user and hypervisor | | | if present). | +-----------------------------------+-----------------------------------+ @@ -637,11 +638,6 @@ used to select a subset of microarchitecture features that will be cleared. The list of arguments, if any, will be detailed in the user’s guide. -Anticipation of verification: It can be cumbersome to prove the timing -decorrelation as expressed in the requirement with digital simulations. -We can simulate the microarchitecture features and explain how they -satisfy the requirement as Nils Wistoff’s work demonstrated. - .. _ppa_targets: PPA targets @@ -752,13 +748,6 @@ Coprocessor interface | | [CV-X-IF] specification. 
| +-----------------------------------+-----------------------------------+ -The goal is to have a compatible interface between CORE-V cores (CVA6, -CV32E40X…). The feasibility still needs to be confirmed; including the -speculative execution. - -CVA6 can interface with several coprocessors simultaneously through a -specific external feature implemented on the CV-X-IF interface. - .. _multi_core_interface: Multi-core interface diff --git a/docs/04_cv32a65x/design/Makefile b/docs/04_cv32a65x/design/Makefile index 974e00577e..b48cea0b29 100644 --- a/docs/04_cv32a65x/design/Makefile +++ b/docs/04_cv32a65x/design/Makefile @@ -1,38 +1,10 @@ +# Copyright 2024 Thales DIS France SAS +# Licensed under the Solderpad Hardware License, Version 2.1 (the "License"); +# you may not use this file except in compliance with the License. +# SPDX-License-Identifier: Apache-2.0 WITH SHL-2.1 +# You may obtain a copy of the License at https://solderpad.org/licenses/ # -# Copyright (c) 2020 OpenHW Group -# -# Licensed under the Solderpad Hardware Licence, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# https://solderpad.org/licenses/ -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# -# SPDX-License-Identifier: Apache-2.0 WITH SHL-2.0 -# -############################################################################### -# -# Minimal makefile for Sphinx documentation -# - -# You can set these variables from the command line. -SPHINXOPTS = -SPHINXBUILD = sphinx-build -SOURCEDIR = source -BUILDDIR = build - -# Put it first so that "make" without argument is like "make help". 
-help: - @$(SPHINXBUILD) -M help "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O) - -.PHONY: help Makefile +# Original Author: Thales DIS -# Catch-all target: route all unknown targets to Sphinx using the new -# "make mode" option. $(O) is meant as a shortcut for $(SPHINXOPTS). -%: Makefile - @$(SPHINXBUILD) -M $@ "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O) +CONFIG := cv32a65x +include ../../design/build.mk diff --git a/docs/04_cv32a65x/design/source/traps.rst b/docs/04_cv32a65x/design/design.rst similarity index 60% rename from docs/04_cv32a65x/design/source/traps.rst rename to docs/04_cv32a65x/design/design.rst index 4f2988563c..082d32e3a1 100644 --- a/docs/04_cv32a65x/design/source/traps.rst +++ b/docs/04_cv32a65x/design/design.rst @@ -1,5 +1,5 @@ .. - Copyright 2023 Thales DIS France SAS + Copyright (c) 2024 Thales Licensed under the Solderpad Hardware Licence, Version 2.0 (the "License"); you may not use this file except in compliance with the License. SPDX-License-Identifier: Apache-2.0 WITH SHL-2.0 @@ -7,5 +7,16 @@ Original Author: Jean-Roch COULON - Thales +CV32A65X DESIGN DOCUMENT +======================== -.. include:: ../../../01_cva6_user/Traps_Interrupts_Exceptions.rst +.. raw:: html + + + +.. raw:: html + :file: design-cv32a65x.html diff --git a/docs/04_cv32a65x/design/make.bat b/docs/04_cv32a65x/design/make.bat deleted file mode 100644 index 543c6b13b4..0000000000 --- a/docs/04_cv32a65x/design/make.bat +++ /dev/null @@ -1,35 +0,0 @@ -@ECHO OFF - -pushd %~dp0 - -REM Command file for Sphinx documentation - -if "%SPHINXBUILD%" == "" ( - set SPHINXBUILD=sphinx-build -) -set SOURCEDIR=source -set BUILDDIR=build - -if "%1" == "" goto help - -%SPHINXBUILD% >NUL 2>NUL -if errorlevel 9009 ( - echo. - echo.The 'sphinx-build' command was not found. Make sure you have Sphinx - echo.installed, then set the SPHINXBUILD environment variable to point - echo.to the full path of the 'sphinx-build' executable. 
Alternatively you - echo.may add the Sphinx directory to PATH. - echo. - echo.If you don't have Sphinx installed, grab it from - echo.http://sphinx-doc.org/ - exit /b 1 -) - -%SPHINXBUILD% -M %1 %SOURCEDIR% %BUILDDIR% %SPHINXOPTS% -goto end - -:help -%SPHINXBUILD% -M help %SOURCEDIR% %BUILDDIR% %SPHINXOPTS% - -:end -popd diff --git a/docs/04_cv32a65x/design/requirements.txt b/docs/04_cv32a65x/design/requirements.txt deleted file mode 100644 index ed9ee59efb..0000000000 --- a/docs/04_cv32a65x/design/requirements.txt +++ /dev/null @@ -1,5 +0,0 @@ -sphinx -sphinx-rtd-theme -recommonmark -sphinxcontrib-svg2pdfconverter -sphinx_github_changelog diff --git a/docs/04_cv32a65x/design/source/CSRs.rst b/docs/04_cv32a65x/design/source/CSRs.rst deleted file mode 100644 index a3dd55d115..0000000000 --- a/docs/04_cv32a65x/design/source/CSRs.rst +++ /dev/null @@ -1 +0,0 @@ -.. include:: ../../../../config/gen_from_riscv_config/cv32a65x/csr/csr.rst diff --git a/docs/04_cv32a65x/design/source/conf.py b/docs/04_cv32a65x/design/source/conf.py deleted file mode 100644 index 976ee3774a..0000000000 --- a/docs/04_cv32a65x/design/source/conf.py +++ /dev/null @@ -1,214 +0,0 @@ -# -*- coding: utf-8 -*- -# -# Copyright (c) 2020 OpenHW Group -# -# Licensed under the Solderpad Hardware Licence, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# https://solderpad.org/licenses/ -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# -# SPDX-License-Identifier: Apache-2.0 WITH SHL-2.0 -# -############################################################################### -# -# Configuration file for the Sphinx documentation builder. 
-# -# This file does only contain a selection of the most common options. For a -# full list see the documentation: -# http://www.sphinx-doc.org/en/master/config - -# -- Path setup -------------------------------------------------------------- - -# If extensions (or modules to document with autodoc) are in another directory, -# add these directories to sys.path here. If the directory is relative to the -# documentation root, use os.path.abspath to make it absolute, like shown here. -# -# import os -# import sys -# sys.path.insert(0, os.path.abspath('.')) - - -# -- Project information ----------------------------------------------------- - -project = u'CORE-V CV32A6 v0.1.0 Design Document' -copyright = u'2022, Thales Group' -author = u'Thales and OpenHW Group' - -# The short X.Y version -version = u'' -# The full version, including alpha/beta/rc tags -release = u'' - - -# -- General configuration --------------------------------------------------- - -# If your documentation needs a minimal Sphinx version, state it here. -# -# needs_sphinx = '1.0' - -# Add any Sphinx extension module names here, as strings. They can be -# extensions coming with Sphinx (named 'sphinx.ext.*') or your custom -# ones. -extensions = [ - 'sphinx.ext.autodoc', - 'sphinx.ext.todo', - 'recommonmark', - 'sphinxcontrib.inkscapeconverter', - 'sphinx_github_changelog', -# 'sphinxcontrib.wavedrom', -] -#wavedrom_html_jsinline = False - -# Add any paths that contain templates here, relative to this directory. -templates_path = ['ytemplates'] - -# The suffix(es) of source filenames. -# You can specify multiple suffix as a list of string: -# -# source_suffix = ['.rst', '.md'] -source_suffix = '.rst' - -# The master toctree document. -master_doc = 'index' - -# The language for content autogenerated by Sphinx. Refer to documentation -# for a list of supported languages. -# -# This is also used if you do content translation via gettext catalogs. 
-# Usually you set "language" from the command line for these cases. -language = 'en' - -# List of patterns, relative to source directory, that match files and -# directories to ignore when looking for source files. -# This pattern also affects html_static_path and html_extra_path. -exclude_patterns = [] - -# Numbering -numfig=True -numfig_format = {'figure': 'Figure %s', 'table': 'Table %s', 'code-block': 'Listing %s'} - -# The name of the Pygments (syntax highlighting) style to use. -pygments_style = None - - -# -- Options for HTML output ------------------------------------------------- - -# The theme to use for HTML and HTML Help pages. See the documentation for -# a list of builtin themes. -# -#html_theme = 'alabaster' -html_theme = 'sphinx_rtd_theme' - -# Theme options are theme-specific and customize the look and feel of a theme -# further. For a list of options available for each theme, see the -# documentation. -# -html_theme_options = {'style_nav_header_background': '#DDDDDD'} -html_logo = '../images/openhw-landscape.svg' - -# Add any paths that contain custom static files (such as style sheets) here, -# relative to this directory. They are copied after the builtin static files, -# so a file named "default.css" will overwrite the builtin "default.css". -#html_static_path = ['ystatic'] -# Set html_static_path to null on the advice of RTDs: -html_static_path = [] - -# Custom sidebar templates, must be a dictionary that maps document names -# to template names. -# -# The default sidebars (for documents that don't match any pattern) are -# defined by theme itself. Builtin themes are using these templates by -# default: ``['localtoc.html', 'relations.html', 'sourcelink.html', -# 'searchbox.html']``. -# -# html_sidebars = {} - - -# -- Options for HTMLHelp output --------------------------------------------- - -# Output file base name for HTML help builder. 
-htmlhelp_basename = 'CORE-V_CV32A6_V0.1.0_DESIGN_DOC' - - -# -- Options for LaTeX output ------------------------------------------------ - -latex_elements = { - # The paper size ('letterpaper' or 'a4paper'). - # - # 'papersize': 'letterpaper', - - # The font size ('10pt', '11pt' or '12pt'). - # - # 'pointsize': '10pt', - - # Additional stuff for the LaTeX preamble. - # - # 'preamble': '', - - # Latex figure (float) alignment - # - # 'figure_align': 'htbp', -} - -# Grouping the document tree into LaTeX files. List of tuples -# (source start file, target name, title, -# author, documentclass [howto, manual, or own class]). -latex_documents = [ - (master_doc, 'CV32A6-v0.1.0_Design_Spec.tex', u'CORE-V-Docs Documentation', - u'Jean-Roch Coulon', 'manual'), -] - - -# -- Options for manual page output ------------------------------------------ - -# One entry per manual page. List of tuples -# (source start file, name, description, authors, manual section). -man_pages = [ - (master_doc, 'CV32A6-v0.1.0_Design_Spec.tex', u'CORE-V-Docs Documentation', - [author], 1) -] - - -# -- Options for Texinfo output ---------------------------------------------- - -# Grouping the document tree into Texinfo files. List of tuples -# (source start file, target name, title, author, -# dir menu entry, description, category) -texinfo_documents = [ - (master_doc, 'CV32A6-v0.1.0_Design_Spec.tex', u'CORE-V-Docs Documentation', - author, 'UserManual', 'User Manual for CV32A6 v0.1.0 CORE-V processor core.', - 'Miscellaneous'), -] - - -# -- Options for Epub output ------------------------------------------------- - -# Bibliographic Dublin Core info. -epub_title = project - -# The unique identifier of the text. This can be a ISBN number -# or the project homepage. -# -# epub_identifier = '' - -# A unique identification for the text. -# -# epub_uid = '' - -# A list of files that should not be packed into the epub file. 
-epub_exclude_files = ['search.html'] - - -# -- Extension configuration ------------------------------------------------- - -# -- Options for todo extension ---------------------------------------------- - -# If true, `todo` and `todoList` produce output, else they produce nothing. -todo_include_todos = True diff --git a/docs/04_cv32a65x/design/source/cv32a6_frontend.rst b/docs/04_cv32a65x/design/source/cv32a6_frontend.rst deleted file mode 100644 index f647a618a2..0000000000 --- a/docs/04_cv32a65x/design/source/cv32a6_frontend.rst +++ /dev/null @@ -1,236 +0,0 @@ -.. - Copyright 2021 Thales DIS design services SAS - Licensed under the Solderpad Hardware Licence, Version 2.0 (the "License"); - you may not use this file except in compliance with the License. - SPDX-License-Identifier: Apache-2.0 WITH SHL-2.0 - You may obtain a copy of the License at https://solderpad.org/licenses/ - - Original Author: Jean-Roch COULON - Thales - -.. _CV32A6_FRONTEND: - -FRONTEND Module -=============== - -Description ------------ - -The FRONTEND module implements two first stages of the cva6 pipeline, -PC gen and Fetch stages. - -PC gen stage is responsible for generating the next program counter. -It hosts a Branch Target Buffer (BTB), a Branch History Table (BHT) and a Return Address Stack (RAS) to speculate on control flow instructions. - -Fetch stage requests data to the CACHE module, realigns the data to store them in instruction queue and transmits the instructions to the DECODE module. -FRONTEND can fetch up to 2 instructions per cycles when C extension instructions is enabled, but DECODE module decodes up to one instruction per cycles. - -The module is connected to: - -* CACHES module provides fethed instructions to FRONTEND. -* DECODE module receives instructions from FRONTEND. 
-* CONTROLLER module can order to flush and to halt FRONTEND PC gen stage -* EXECUTE, CONTROLLER, CSR and COMMIT modules trigger PC jumping due to a branch misprediction, an exception, a return from an exception, a debug entry or a pipeline flush. - They provides the PC next value. -* CSR module states about debug mode. - -.. include:: port_frontend.rst - -Functionality -------------- - - - -PC Generation stage -~~~~~~~~~~~~~~~~~~~ - -PC gen generates the next program counter. The next PC can originate from the following sources (listed in order of precedence): - -* **Reset state:** At reset, the PC is assigned to the boot address. - -* **Branch Prediction:** The fetched instruction is predecoded by the instr_scan submodule. - When the instruction is a control flow, three cases are considered: - - 1. When the instruction is a JALR which corresponds to a return (rs1 = x1 or rs1 = x5). - RAS provides next PC as a prediction. - - 2. When the instruction is a JALR which **does not** correspond to a return. - If BTB (Branch Target Buffer) returns a valid address, then BTB predicts next PC. - Else JALR is not considered as a control flow instruction, which will generate a mispredict. - - 3. When the instruction is a conditional branch. - If BHT (Branch History table) returns a valid address, then BHT predicts next PC. - Else the prediction depends on the PC relative jump offset sign: if sign is negative the prediction is taken, otherwise the prediction is not taken. - - Then the PC gen informs the Fetch stage that it performed a prediction on the PC. - -* **Default:** The next 32-bit block is fetched. - PC Gen fetches word boundary 32-bits block from CACHES module. And the fetch stage identifies the instructions from the 32-bits blocks. - -* **Mispredict:** Misprediction are feedbacked by EX_STAGE module. - In any case we need to correct our action and start fetching from the correct address. 
- -* **Replay instruction fetch:** When the instruction queue is full, the instr_queue submodule asks the fetch replay and provides the address to be replayed. - -* **Return from environment call:** When CSR requests a return from an environment call, next PC takes the value of the PC of the instruction after the one pointed to by the mepc CSR. - -* **Exception/Interrupt:** If an exception is triggered by CSR_REGISTER, next PC takes the value of the trap vector base address CSR. - -* **Pipeline starting fetching from COMMIT PC:** When the commit stage is halted by a WFI instruction or when the pipeline has been flushed due to CSR change, next PC takes the value of the PC coming from the COMMIT submodule. - As CSR instructions do not exist in a compressed form, PC is unconditionally incremented by 4. - -.. user and supervisor modes are not supported by CV32A65X - The trap vector base address can be different depending on whether the exception traps to S-Mode or M-Mode (user mode exceptions are currently not supported). - It is the purpose of the CSR Unit to figure out where to trap to and present the correct address to PC Gen. - -.. Debug feature is not supported by CV32A65X - * **Debug:** Debug has the highest order of precedence as it can interrupt any control flow requests. It also the only source of control flow change which can actually happen simultaneously to any other of the forced control flow changes. - The debug jump is requested by CSR. - The address to be jumped into is HW coded. - - -All program counters are logical addressed. - -.. MMU is not supported in CV32A65X - If the logical to physical mapping changes, a ``fence.vm`` instruction should be used to flush the pipeline *and TLBs (MMU is not enabled in CV32A6 v0.1.0)*. - - - -Fetch Stage -~~~~~~~~~~~ - -Fetch stage controls the CACHE module by a handshaking protocol. -Fetched data is a 32-bit block with a word-aligned address. 
-A granted fetch is processed by the instr_realign submodule to produce instructions. -Then instructions are pushed into an internal instruction FIFO called instruction queue (instr_queue submodule). -This submodule stores the instructions and sends them to the DECODE module. - -.. TO_BE_COMPLETED MMU also feedback an exception, but not present in 65X - -Memory can feedback potential exceptions which can be bus errors, invalid accesses or instruction page faults. -The FRONTEND transmits the exception from CACHES to DECODE. - - - -Submodules ----------- - -.. figure:: ../images/frontend_modules.png - :name: FRONTEND submodules - :align: center - - FRONTEND submodules - -.. figure:: ../images/ZoominFrontend.png - :name: frontend-schematic - :align: center - - FRONTEND submodule interconnections - -Instr_realign submodule -~~~~~~~~~~~~~~~~~~~~~~~ - -The 32-bit aligned block coming from the CACHE module enters the instr_realign submodule. -This submodule extracts the instructions from the 32-bit blocks. -It is possible to fetch up to two instructions per cycle when C extension is used. -A non-compressed instruction can be misaligned on the block size, interleaved with two cache blocks. -In that case, two cache accesses are needed to get the whole instruction. -The instr_realign submodule provides at maximum two instructions per cycle when compressed extension is enabled, else one instruction per cycle. -Incomplete instruction is stored in instr_realign submodule until its second half is fetched. - -.. include:: port_instr_realign.rst - - -Instr_queue submodule -~~~~~~~~~~~~~~~~~~~~~ - -The instr_queue receives multiple instructions from instr_realign submodule to create a valid stream of instructions to be decoded (by DECODE), to be issued (by ISSUE) and executed (by EXECUTE).
-FRONTEND pushes in FIFO to store the instructions and related information needed in case of mispredict or exception: instructions, instruction control flow type, exception, exception address and predicted address. -DECODE pops them when decode stage is ready and indicates to the FRONTEND the instruction has been consumed. - -The instruction queue contains max 4 instructions. -If the instruction queue is full, a replay request is sent to inform the fetch mechanism to replay the fetch. - -The instruction queue can be flushed by CONTROLLER. - -.. include:: port_instr_queue.rst - - -instr_scan submodule -~~~~~~~~~~~~~~~~~~~~ - -As compressed extension is enabled, two instr_scan are instantiated to handle up to two instructions per cycle. - -Each instr_scan submodule pre-decodes the fetched instructions coming from the instr_realign module, instructions could be compressed or not. -The instr_scan submodule is a flow controller which provides the instruction type: branch, jump, return, jalr, imm, call or others. -These outputs are used by the branch prediction feature. - -.. include:: port_instr_scan.rst - - -BHT (Branch History Table) submodule -~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - - -BHT is implemented as a memory which is composed of **BHTDepth configuration parameter** entries. The lower address bits of the virtual address point to the memory entry. - -When a branch instruction is resolved by the EX_STAGE module, the branch PC and the taken (or not taken) status information is stored in the Branch History Table. - -.. TO_BE_COMPLETED: Specify the behaviour when BHT is saturated - -The Branch History Table is a table of two-bit saturating counters that takes the virtual address of the current fetched instruction by the CACHE. -It states whether the current branch request should be taken or not. -The two bit counter is updated by the successive execution of the instructions as shown in the following figure. - -..
figure:: ../images/bht.png - :name: BHT saturation - :align: center - :alt: - - BHT saturation - -.. TODO: if debug enable, The BHT is not updated if processor is in debug mode. - -When a branch instruction is pre-decoded by instr_scan submodule, the BHT validates whether the PC address is in the BHT and provides the taken or not prediction. - -The BHT is never flushed. - - -.. include:: port_bht.rst - -BTB (Branch Target Buffer) submodule -~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - - -BTB is implemented as an array which is composed of **BTBDepth configuration parameter** entries. -The lower address bits of the virtual address point to the memory entry. - -When a JALR instruction is found mispredicted by the EX_STAGE module, the JALR PC and the target address are stored into the BTB. - -.. TODO: Specify the behaviour when BTB is saturated - -.. TODO: when debug enabled, The BTB is not updated if processor is in debug mode. - -When a JALR instruction is pre-decoded by instr_scan submodule, the BTB informs whether the input PC address is in the BTB. -In this case, the BTB provides the predicted target address. - -The BTB is never flushed. - - -.. include:: port_btb.rst - - -RAS (Return Address Stack) submodule -~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - - -RAS is implemented as a LIFO which is composed of **RASDepth configuration parameter** entries. - -When a JAL instruction is pre-decoded by the instr_scan, the PC of the instruction following JAL instruction is pushed into the RAS when the JAL instruction is added to the instruction queue. - -When a JALR instruction which corresponds to a return (rs1 = x1 or rs1 = x5) is pre-decoded by the instr_scan, the predicted return address is popped from the RAS when the JALR instruction is added to the instruction queue. -If the predicted return address is wrong due for instance to speculation or RAS depth limitation, a misprediction will be generated. - -The RAS is never flushed. - -..
include:: port_ras.rst - diff --git a/docs/04_cv32a65x/design/source/cv32a6_glossary.rst b/docs/04_cv32a65x/design/source/cv32a6_glossary.rst deleted file mode 100644 index 247cf315cd..0000000000 --- a/docs/04_cv32a65x/design/source/cv32a6_glossary.rst +++ /dev/null @@ -1,71 +0,0 @@ -.. - Copyright (c) 2020 OpenHW Group - - Licensed under the Solderpad Hardware Licence, Version 2.0 (the "License"); - you may not use this file except in compliance with the License. - You may obtain a copy of the License at - - https://solderpad.org/licenses/ - - Unless required by applicable law or agreed to in writing, software - distributed under the License is distributed on an "AS IS" BASIS, - WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - See the License for the specific language governing permissions and - limitations under the License. - - SPDX-License-Identifier: Apache-2.0 WITH SHL-2.0 - -.. _CV32A6_GLOSSARY: - -Glossary -======== - -* **ALU**: Arithmetic/Logic Unit -* **APU**: Application Processing Unit -* **ASIC**: Application-Specific Integrated Circuit -* **AXI**: Advanced eXtensible Interface -* **BHT**: Branch History Table -* **BTB**: Branch Target Buffer -* **Byte**: 8-bit data item -* **CPU**: Central Processing Unit, processor -* **CSR**: Control and Status Register -* **Custom extension**: Non-Standard extension to the RISC-V base instruction set (RISC-V Instruction Set Manual, Volume I: User-Level ISA) -* **CVA6**: Core-V Application class processor with a 6 stage pipeline -* **D$**: Data Cache -* **DPI**: Direct Programming Interface -* **EX** or **EXE**: Instruction Execute -* **FPGA**: Field Programmable Gate Array -* **FPU**: Floating Point Unit -* **Halfword**: 16-bit data item -* **Halfword aligned address**: An address is halfword aligned if it is divisible by 2 -* **I$**: Instruction Cache -* **ID**: Instruction Decode -* **IF**: Instruction Fetch -* **ISA**: Instruction Set Architecture -* **KGE**: Kilo Gate Equivalents 
(NAND2) -* **LSU**: Load Store Unit -* **M-Mode**: Machine Mode (RISC-V Instruction Set Manual, Volume II: Privileged Architecture) -* **MMU**: Memory Management Unit -* **NC**: Not Cacheable -* **OBI**: Open Bus Interface -* **OoO**: Out Of Order -* **PC**: Program Counter -* **PMP**: Physical memory protection (RISC-V Instruction Set Manual, Volume II: Privileged Architecture) -* **PTW**: Page Table Walker -* **PULP platform**: Parallel Ultra Low Power Platform () -* **RAS**: Return Address Stack -* **RV32C**: RISC-V Compressed (C extension) -* **RV32F**: RISC-V Floating Point (F extension) -* **S-Mode**: Supervisor Mode (RISC-V Instruction Set Manual, Volume II: Privileged Architecture) -* **SIMD**: Single Instruction/Multiple Data -* **Standard extension**: Standard extension to the RISC-V base instruction set (RISC-V Instruction Set Manual, Volume I: User-Level ISA) -* **TLB**: Translation Lookaside Buffer -* **U-Mode**: User Mode (RISC-V Instruction Set Manual, Volume II: Privileged Architecture) -* **VLEN**: Virtual address length -* **WARL**: Write Any Values, Reads Legal Values -* **WB**: Write Back of instruction results -* **WLRL**: Write/Read Only Legal Values -* **Word**: 32-bit data item -* **Word aligned address**: An address is word aligned if it is divisible by 4 -* **WPRI**: Reserved Writes Preserve Values, Reads Ignore Values -* **XLEN**: RISC-V processor data length diff --git a/docs/04_cv32a65x/design/source/cva6_commit_stage.rst b/docs/04_cv32a65x/design/source/cva6_commit_stage.rst deleted file mode 100644 index b4dde9e917..0000000000 --- a/docs/04_cv32a65x/design/source/cva6_commit_stage.rst +++ /dev/null @@ -1,36 +0,0 @@ -.. - Copyright 2024 Thales DIS France SAS - Licensed under the Solderpad Hardware License, Version 2.1 (the "License"); - you may not use this file except in compliance with the License. 
- SPDX-License-Identifier: Apache-2.0 WITH SHL-2.1 - You may obtain a copy of the License at https://solderpad.org/licenses/ - - Original Author: Jean-Roch COULON - Thales - -.. _CVA6_COMMIT_STAGE: - -COMMIT_STAGE Module -=================== - -Description ------------ - -The COMMIT_STAGE module implements the commit stage, which is the last stage in the processor’s pipeline. -For the instructions for which the execution is completed, it updates the architectural state: writing CSR registers, committing stores and writing back data to the register file. -The commit stage controls the stalling and the flushing of the processor. - -The commit stage also manages the exceptions. -An exception can occur during the first four pipeline stages (PCgen cannot generate an exception) or happen in commit stage, coming from the CSR_REGFILE or from an interrupt. -Exceptions are precise: they are considered during the commit only and associated with the related instruction. - -The module is connected to: - -* TO BE COMPLETED - -.. include:: port_commit_stage.rst - -Functionality -------------- - -TO BE COMPLETED - diff --git a/docs/04_cv32a65x/design/source/cva6_id_stage.rst b/docs/04_cv32a65x/design/source/cva6_id_stage.rst deleted file mode 100644 index faa3c579ea..0000000000 --- a/docs/04_cv32a65x/design/source/cva6_id_stage.rst +++ /dev/null @@ -1,79 +0,0 @@ -.. - Copyright 2024 Thales DIS France SAS - Licensed under the Solderpad Hardware License, Version 2.1 (the "License"); - you may not use this file except in compliance with the License. - SPDX-License-Identifier: Apache-2.0 WITH SHL-2.1 - You may obtain a copy of the License at https://solderpad.org/licenses/ - - Original Author: Jean-Roch COULON - Thales - -.. _CVA6_ID_STAGE: - -ID_STAGE Module -=============== - -Description ------------ - -The ID_STAGE module implements the decode stage of the pipeline. 
-Its main purpose is to decode RISC-V instructions coming from FRONTEND module -(fetch stage) and send them to the ISSUE_STAGE module (issue stage). - -The compressed_decoder module checks whether the incoming instruction is -compressed and output the corresponding uncompressed instruction. -Then the decoder module decodes the instruction and send it to the -issue stage. - - -The module is connected to: - -* CONTROLLER module can flush ID_STAGE decode stage -* FRONTEND module sends instrution to ID_STAGE module -* ISSUE module receives the decoded instruction from ID_STAGE module -* CSR_REGFILE module sends status information about privilege mode, traps, extension support. - -.. include:: port_id_stage.rst - - - -Functionality -------------- - -TO BE COMPLETED - - -Submodules ----------- - -.. figure:: ../images/id_stage_modules.png - :name: ID_STAGE submodules - :align: center - :alt: - - ID_STAGE submodules - - -Compressed_decoder -~~~~~~~~~~~~~~~~~~ - -The compressed_decoder module decompresses all the compressed -instructions taking a 16-bit compressed instruction and expanding it -to its 32-bit equivalent. -All compressed instructions have a 32-bit equivalent. - -.. include:: port_compressed_decoder.rst - -Decoder -~~~~~~~ - -The decoder module takes the output of compressed_decoder module and decodes it. -It transforms the instruction to the most fundamental control structure in pipeline, a scoreboard entry. - -The scoreboard entry contains an exception entry which is composed of a valid field, a cause and a value called TVAL. -As TVALEn configuration parameter is zero, the TVAL field is not implemented. - -A potential illegal instruction exception can be detected during decoding. -If no exception has happened previously in fetch stage, the decoder will valid the exception and add the cause and tval value to the scoreboard entry. - -.. 
include:: port_decoder.rst - diff --git a/docs/04_cv32a65x/design/source/cva6_issue_stage.rst b/docs/04_cv32a65x/design/source/cva6_issue_stage.rst deleted file mode 100644 index a0d542d763..0000000000 --- a/docs/04_cv32a65x/design/source/cva6_issue_stage.rst +++ /dev/null @@ -1,64 +0,0 @@ -.. - Copyright 2024 Thales DIS France SAS - Licensed under the Solderpad Hardware License, Version 2.1 (the "License"); - you may not use this file except in compliance with the License. - SPDX-License-Identifier: Apache-2.0 WITH SHL-2.1 - You may obtain a copy of the License at https://solderpad.org/licenses/ - - Original Author: Jean-Roch COULON - Thales - -.. _CVA6_ISSUE_STAGE: - -ISSUE_STAGE Module -================== - -Description ------------ - -The execution can be roughly divided into four parts: issue(1), read operands(2), execute(3) and write-back(4). -The ISSUE_STAGE module handles step one, two and four. -The ISSUE_STAGE module receives the decoded instructions and issues them to the various functional units. - -A data structure called scoreboard is used to keep track of data related to the issue instruction: which functional unit and which destination register they are. -The scoreboard handle the write-back data received from the COMMIT_STAGE module. - -Furthermore it contains the CPU’s register file. - - -The module is connected to: - -* TO BE COMPLETED - -.. include:: port_issue_stage.rst - -Functionality -------------- - -TO BE COMPLETED - - -Submodules ----------- - -.. figure:: ../images/issue_stage_modules.png - :name: ISSUE_STAGE submodules - :align: center - :alt: - - ISSUE_STAGE submodules - -Scoreboard -~~~~~~~~~~ - -The scoreboard contains a FIFO to store the decoded instructions. -Issued instruction is pushed to the FIFO if it is not full. -It indicates which registers are going to be clobbered by a previously issued instruction. - -.. include:: port_scoreboard.rst - -Issue_read_operands -~~~~~~~~~~~~~~~~~~~ - -TO BE COMPLETED - -.. 
include:: port_issue_read_operands.rst diff --git a/docs/04_cv32a65x/design/images/CV32A65X_subsystems.png b/docs/04_cv32a65x/design/source/images/CV32A65X_subsystems.png similarity index 100% rename from docs/04_cv32a65x/design/images/CV32A65X_subsystems.png rename to docs/04_cv32a65x/design/source/images/CV32A65X_subsystems.png diff --git a/docs/04_cv32a65x/design/source/index.rst b/docs/04_cv32a65x/design/source/index.rst deleted file mode 100644 index f4efb056b4..0000000000 --- a/docs/04_cv32a65x/design/source/index.rst +++ /dev/null @@ -1,23 +0,0 @@ -.. - Copyright (c) 2022 Thales - Licensed under the Solderpad Hardware Licence, Version 2.0 (the "License"); - you may not use this file except in compliance with the License. - SPDX-License-Identifier: Apache-2.0 WITH SHL-2.0 - You may obtain a copy of the License at https://solderpad.org/licenses/ - - Original Author: Jean-Roch COULON - Thales - - -Design Document -=============== -Editor: **Jean Roch Coulon** - -.. toctree:: - :maxdepth: 4 - :caption: Contents: - - intro - subsystem - functionality - architecture - cv32a6_glossary diff --git a/docs/04_cv32a65x/design/source/instructions.rst b/docs/04_cv32a65x/design/source/instructions.rst deleted file mode 100644 index c4efe68443..0000000000 --- a/docs/04_cv32a65x/design/source/instructions.rst +++ /dev/null @@ -1,31 +0,0 @@ -.. - Copyright 2023 Thales DIS France SAS - Licensed under the Solderpad Hardware Licence, Version 2.0 (the "License"); - you may not use this file except in compliance with the License. - SPDX-License-Identifier: Apache-2.0 WITH SHL-2.0 - You may obtain a copy of the License at https://solderpad.org/licenses/ - - Original Author: Jean-Roch COULON - Thales - -Instructions -============ - -The next first subchapter lists the extensions implemented in CVA6. -By configuration, we can enable/disable the extensions. -CV32A65X supports the extensions described in the next subchapters. 
-RVZicond, RV32A and RVZifencei extensions are not supported by CV32A65X. - - -.. toctree:: - :maxdepth: 1 - - ../../../01_cva6_user/RISCV_Instructions - ../../../01_cva6_user/RISCV_Instructions_RV32I - ../../../01_cva6_user/RISCV_Instructions_RV32M - ../../../01_cva6_user/RISCV_Instructions_RV32C - ../../../01_cva6_user/RISCV_Instructions_RV32ZCb - ../../../01_cva6_user/RISCV_Instructions_RVZba - ../../../01_cva6_user/RISCV_Instructions_RVZbb - ../../../01_cva6_user/RISCV_Instructions_RVZbc - ../../../01_cva6_user/RISCV_Instructions_RVZbs - ../../../01_cva6_user/RISCV_Instructions_RVZicsr diff --git a/docs/04_cv32a65x/design/source/intro.rst b/docs/04_cv32a65x/design/source/intro.rst deleted file mode 100644 index 20e808b01f..0000000000 --- a/docs/04_cv32a65x/design/source/intro.rst +++ /dev/null @@ -1,95 +0,0 @@ -.. - Copyright 2022 Thales DIS design services SAS - Licensed under the Solderpad Hardware Licence, Version 2.0 (the "License"); - you may not use this file except in compliance with the License. - SPDX-License-Identifier: Apache-2.0 WITH SHL-2.0 - You may obtain a copy of the License at https://solderpad.org/licenses/ - - Original Author: Jean-Roch COULON - Thales - - - -Introduction -============ - -The OpenHW Group uses `semantic versioning `_ to describe the release status of its IP. -This document describes the CV32A65X configuration version of CVA6. -This intends to be the first formal release of CVA6. - -CVA6 is a 6-stage in-order and single issue processor core which implements the RISC-V instruction set. -CVA6 can be configured as a 32- or 64-bit core (RV32 or RV64), called CV32A6 or CV64A6. - -The objective of this document is to provide enough information to allow the RTL modification (by designers) and the RTL verification (by verificators). -This document is not dedicated to CVA6 users looking for information to develop software like instructions or registers. - -The CVA6 architecture is illustrated in the following figure. - -.. 
figure:: ../images/ariane_overview.drawio.png - :name: CVA6 Architecute - :align: center - :alt: - - CVA6 Architecture - - -License -------- - -| Copyright 2022 Thales -| Copyright 2018 ETH Zürich and University of Bologna -| SPDX-License-Identifier: Apache-2.0 WITH SHL-2.1 -| Licensed under the Solderpad Hardware License v 2.1 (the “License”); - you may not use this file except in compliance with the License, or, - at your option, the Apache License version 2.0. You may obtain a copy - of the License at https://solderpad.org/licenses/SHL-2.1/. -| Unless required by applicable law or agreed to in writing, any work - distributed under the License is distributed on an “AS IS” BASIS, - WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or - implied. See the License for the specific language governing - permissions and limitations under the License. - - -Standards Compliance --------------------- - -To ease the reading, the reference to these specifications can be implicit in the requirements below. For the sake of precision, the requirements identify the versions of RISC-V extensions from these specifications. - -* **[CVA6req]** “CVA6 requirement specification”, https://github.com/openhwgroup/cva6/blob/master/docs/specifications/cva6_requirement_specification.rst, HASH#767c465. -* **[RVunpriv]** “The RISC-V Instruction Set Manual, Volume I: User-Level ISA, Document Version 20191213”, Editors Andrew Waterman and Krste Asanović, RISC-V Foundation, December 13, 2019. -* **[RVpriv]** “The RISC-V Instruction Set Manual, Volume II: Privileged Architecture, Document Version 20211203”, Editors Andrew Waterman, Krste Asanović and John Hauser, RISC-V Foundation, December 4, 2021. -* **[RVdbg]** “RISC-V External Debug Support, Document Version 0.13.2”, Editors Tim Newsome and Megan Wachs, RISC-V Foundation, March 22, 2019. -* **[RVcompat]** “RISC-V Architectural Compatibility Test Framework”, https://github.com/riscv-non-isa/riscv-arch-test. 
-* **[AXI]** AXI Specification, https://developer.arm.com/documentation/ihi0022/hc. -* **[CV-X-IF]** Placeholder for the CV-X-IF coprocessor interface currently prepared at OpenHW Group; current version in https://docs.openhwgroup.org/projects/openhw-group-core-v-xif/. -* **[OpenPiton]** “OpenPiton Microarchitecture Specification”, Princeton University, https://parallel.princeton.edu/openpiton/docs/micro_arch.pdf. - -CV32A6 is a standards-compliant 32-bit processor fully compliant with RISC-V specifications: [RVunpriv], [RVpriv] and [RVdbg] and passes [RVcompat] compatibility tests, as requested by [GEN-10] in [CVA6req]. - - -Documentation framework ------------------------ - -The framework of this document is inspired by the Common Criteria. The Common Criteria for Information Technology Security Evaluation (referred to as Common Criteria or CC) is an international standard (ISO/IEC 15408) for computer security certification. - -Description of the framework: - -* Processor is split into module corresponding to the main modules of the design -* Modules can contain several modules -* Each module is described in a chapter, which contains the following subchapters: *Description*, *Functionalities*, *Architecture and Modules* and *Registers* (if any) -* The subchapter *Description* describes the main features of the submodule, the interconnections between the current module and the others and the inputs/outputs interface. -* The subchapter *Functionality* lists in details the module functionalities. Please avoid using the RTL signal names to explain the functionalities. 
-* The subchapter *Architecture and Modules* provides a drawing to present the module hierarchy, then the functionalities covered by the module -* The subchapter *Registers* specifies the module registers if any - - -Contributors ------------- - -| Jean-Roch Coulon - Thales -| Ayoub Jalali - (`ayoub.jalali@external.thalesgroup.com `__) -| Alae Eddine Ezzejjari - (`alae-eddine.ez-zejjari@external.thalesgroup.com `__) - -[TO BE COMPLETED] - diff --git a/docs/04_cv32a65x/design/source/mmu.rst b/docs/04_cv32a65x/design/source/mmu.rst deleted file mode 100644 index 1f5493f0e9..0000000000 --- a/docs/04_cv32a65x/design/source/mmu.rst +++ /dev/null @@ -1,1587 +0,0 @@ -.. _CVA6_MMU: - - ----------------------- -Memory Management Unit ----------------------- - -The Memory Management Unit (MMU) SV32 module is a crucial component in the RISC-V-based processor, serving as the backbone for virtual memory management and address translation. - -.. figure:: ../images/mmu_in_out.png - :name: **Figure 1:** Inputs and Outputs of CVA6 MMU SV32 - :align: center - :width: 70% - :alt: mmu_in_out - - **Figure 1:** Inputs and Outputs of CVA6 MMU SV32 - -At its core, the MMU SV32 plays a pivotal role in translating virtual addresses into their corresponding physical counterparts. -This translation process is paramount for providing memory protection, isolation, and efficient memory management in modern computer systems. -Importantly, it handles both instruction and data accesses, ensuring a seamless interaction between the processor and virtual memory. -Within the MMU, several major blocks play pivotal roles in this address translation process. These includes: - -* Instruction TLB (ITLB) -* Data TLB (DTLB) -* Shared TLB -* Page Table Walker (PTW) - -.. 
figure:: ../images/mmu_major_blocks.png - :name: **Figure 2:** Major Blocks in CVA6 MMU SV32 - :align: center - :width: 60% - :alt: mmu_major_blocks - - **Figure 2:** Major Blocks in CVA6 MMU SV32 - -The MMU SV32 manages privilege levels and access control, enforcing permissions for user and supervisor modes while handling access exceptions. -It employs Translation Lookaside Buffers (TLBs) for efficient address translation, reducing the need for page table access. -TLB hits yield quick translations, but on misses, the shared TLB is consulted, and if necessary, the Page Table Walker (PTW) performs page table walks, updating TLBs and managing exceptions during the process. - -In addition to these functionalities, the MMU SV32 seamlessly integrates support for Physical Memory Protection (PMP), enabling it to enforce access permissions and memory protection configurations as specified by the PMP settings. -This additional layer of security and control enhances the management of memory accesses - -.. raw:: html - - Instruction and Data Interfaces - -The MMU SV32 maintains interfaces with the instruction cache (ICache) and the load-store unit (LSU). -It receives virtual addresses from these components and proceeds to translate them into physical addresses, a fundamental task for ensuring proper program execution and memory access. - -.. raw:: html - - Signal Description of MMU - -.. raw:: html - -

Table 1: CVA6 MMU SV32 Input Output Signals

- -.. list-table:: - :header-rows: 1 - - * - Signal - - IO - - Connection Type - - Type - - Description - - * - ``clk_i`` - - in - - Subsystem - - logic - - Subsystem Clock - - * - ``rst_ni`` - - in - - Subsystem - - logic - - Asynchronous reset active low - - * - ``flush_i`` - - in - - Controller - - logic - - Sfence Committed - - * - ``enable_translation_i`` - - in - - CSR RegFile - - logic - - Indicate address translation request for instruction - - * - ``en_ld_st_translation_i`` - - in - - CSR RegFile - - logic - - Indicate address translation request for load or store - - * - ``icache_areq_i`` - - in - - Cache Subsystem - - icache_arsp_t - - Icache Response - - * - ``icache_areq_o`` - - out - - Cache Subsystem - - icache_areq_t - - Icache Request - - * - ``misaligned_ex_i`` - - in - - Load Store Unit - - exception_t - - Indicate misaligned exception - - * - ``lsu_req_i`` - - in - - Load Store Unit - - logic - - Request address translation - - * - ``lsu_vaddr_i`` - - in - - Load Store Unit - - logic [riscv::VLEN-1:0] - - Virtual Address In - - * - ``lsu_is_store_i`` - - in - - Store Unit - - logic - - Translation is requested by a store - - * - ``lsu_dtlb_hit_o`` - - out - - Store / Load Unit - - logic - - Indicate a DTLB hit - - * - ``lsu_dtlb_ppn_o`` - - out - - Load Unit - - logic [riscv::PPNW-1:0] - - Send PNN to LSU - - * - ``lsu_valid_o`` - - out - - Load Store Unit - - logic - - Indicate a valid translation - - * - ``lsu_paddr_o`` - - out - - Store / Load Unit - - logic [riscv::PLEN-1:0] - - Translated Address - - * - ``lsu_exception_o`` - - out - - Store / Load Unit - - exception_t - - Address Translation threw an exception - - * - ``priv_lvl_i`` - - in - - CSR RegFile - - riscv::priv_lvl_t - - Privilege level for instruction fetch interface - - * - ``ld_st_priv_lvl_i`` - - in - - CSR RegFile - - riscv::priv_lvl_t - - Privilege Level for Data Interface - - * - ``sum_i`` - - in - - CSR RegFile - - logic - - Supervisor User Memory Access bit in xSTATUS 
CSR register - - * - ``mxr_i`` - - in - - CSR RegFile - - logic - - Make Executable Readable bit in xSTATUS CSR register - - * - ``satp_ppn_i`` - - in - - CSR RegFile - - logic [riscv::PPNW-1:0] - - PPN of top level page table from SATP register - - * - ``asid_i`` - - in - - CSR RegFile - - logic [ASID_WIDTH-1:0] - - ASID for the lookup - - * - ``asid_to_be_flushed_i`` - - in - - Execute Stage - - logic [ASID_WIDTH-1:0] - - ASID of the entry to be flushed. - - * - ``vaddr_to_be_flushed_i`` - - in - - Execute Stage - - logic [riscv::VLEN-1:0] - - Virtual address of the entry to be flushed. - - * - ``flush_tlb_i`` - - in - - Controller - - logic - - SFENCE.VMA committed - - * - ``itlb_miss_o`` - - out - - Performance Counter - - logic - - Indicate an ITLB miss - - * - ``dtlb_miss_o`` - - out - - Performance Counter - - logic - - Indicate a DTLB miss - - * - ``req_port_i`` - - in - - Cache Subsystem - - dcache_req_o_t - - D Cache Data Requests - - * - ``req_port_o`` - - out - - Cache Subsystem - - dcache_req_i_t - - D Cache Data Response - - * - ``pmpcfg_i`` - - in - - CSR RegFile - - riscv::pmpcfg_t [15:0] - - PMP configurations - - * - ``pmpaddr_i`` - - in - - CSR RegFile - - logic [15:0][riscv::PLEN-3:0] - - PMP Address - -.. raw:: html - - Struct Description - -.. raw:: html - -

Table 2: I Cache Request Struct (icache_areq_t)

- -.. list-table:: - :header-rows: 1 - - * - Signal - - Type - - Description - - * - ``fetch_valid`` - - logic - - Address Translation Valid - - * - ``fetch_paddr`` - - logic [riscv::PLEN-1:0] - - Physical Address In - - * - ``fetch_exception`` - - exception_t - - Exception occurred during fetch - -.. raw:: html - -

Table 3: I Cache Response Struct (icache_arsp_t)

- -.. list-table:: - :header-rows: 1 - - * - Signal - - Type - - Description - - * - ``fetch_req`` - - logic - - Address Translation Request - - * - ``fetch_vaddr`` - - logic [riscv::VLEN-1:0] - - Virtual Address out - -.. raw:: html - -

Table 4: Exception Struct (exception_t)

- -.. list-table:: - :header-rows: 1 - - * - Signal - - Type - - Description - - * - ``cause`` - - riscv::xlen_t - - Cause of exception - - * - ``tval`` - - riscv::xlen_t - - Additional information of causing exception (e.g. instruction causing it), address of LD/ST fault - - * - ``valid`` - - logic - - Indicate that exception is valid - -.. raw:: html - -

Table 5: PMP Configuration Struct (pmpcfg_t)

- -.. list-table:: - :header-rows: 1 - - * - Signal - - Type - - Description - - * - ``locked`` - - logic - - Lock this configuration - - * - ``reserved`` - - logic[1:0] - - Reserved bits in pmpcfg CSR - - * - ``addr_mode`` - - pmp_addr_mode_t - - Addressing Modes: OFF, TOR, NA4, NAPOT - - * - ``access_type`` - - pmpcfg_access_t - - None, read, write, execute - -.. raw:: html - - Control Flow in MMU SV32 Module - -.. figure:: ../images/mmu_control_flow.png - :name: **Figure 3:** Control Flow in CVA6 MMU SV32 - :align: center - :width: 95% - :alt: mmu_control_flow - - **Figure 3:** Control Flow in CVA6 MMU SV32 - -.. raw:: html - - Exception Sources with Address Translation Enabled - -Two potential exception sources exist: - -* Hardware Page Table Walker (HPTW) throwing an exception, signifying a page fault exception. -* Access error due to insufficient permissions of PMP, known as an access exception. - -.. raw:: html - - Instruction Fetch Interface - -The IF stage initiates a request to retrieve memory content at a specific virtual address. When the MMU is disabled, the instruction fetch request is directly passed to the I$ without modifications. - -.. raw:: html - - Address Translation in Instruction Interface - -If virtual memory translation is enabled for instruction fetches, the following operations are performed in the instruction interface: - -* Compatibility of requested virtual address with selected page based address translation scheme is checked. -* For 4K page translation, the module determines the fetch physical address by combining the physical page number (PPN) from ITLB content and the offset from the virtual address. -* In the case of Mega page translation, if the ITLB indicates a 4M page, the VPN0 from the fetch virtual address is written to the PPN0 of the fetch physical address to ensure alignment for superpage translation. 
-* If the Instruction TLB (ITLB) lookup hits, the fetch valid signal (which indicates a valid physical address) is activated in response to the input fetch request. Memory region accessibility is checked from the perspective of the fetch operation, potentially triggering a page fault exception in case of an access error or insufficient PMP permission. -* In case of an ITLB miss, if the page table walker (PTW) is active (only active if there is a shared TLB miss) and handling instruction fetches, the fetch valid signal is determined based on PTW errors or access exceptions. - -If the fetch physical address doesn't match any execute region, an Instruction Access Fault is raised. When not translating, PMPs are immediately checked against the physical address for access verification. - -.. raw:: html - - Data Interface - -.. raw:: html - - Address Translation in Data Interface - -If address translation is enabled for load or store, and no misaligned exception has occurred, the following operations are performed in the data interface: - -* Initially, translation is assumed to be invalid, signified by the MMU to LSU. -* The translated physical address is formed by combining the PPN from the Page Table Entry (PTE) and the offset from the virtual address requiring translation. This address is sent one cycle later because of the additional bank of registers, which delays the MMU’s answer. The PPN from the PTE is also shared separately with LSU in the same cycle as the hit. -* In the case of superpage translation, as in SV32, known as the 4M page, PPN0 of the translated physical address and the separately shared PPN are updated with the VPN0 of the virtual address. - -If a Data TLB (DTLB) hit occurs, it indicates a valid translation, and various fault checks are performed depending on whether it's a load or store request. - -* For store requests, if the page is not writable, the dirty flag isn't set, or privileges are violated, it results in a page fault corresponding to the store access.
If PMPs are also violated, it leads to an access fault corresponding to the store access. Page faults take precedence over access faults. -* For load requests, a page fault is triggered if there are insufficient access privileges. PMPs are checked again during load access, resulting in an access fault corresponding to load access if PMPs are violated. - -In case of a DTLB miss, potential exceptions are monitored during the page table walk. If the PTW indicates a page fault, the corresponding page fault related to the requested type is signaled. If the PTW indicates an access exception, the load access fault is indicated through address translation because the page table walker can only throw load access faults. - -.. raw:: html - - Address Translation is Disabled - -When address translation is not enabled, the physical address is immediately checked against Physical Memory Protections (PMPs). If there is a request from LSU, no misaligned exception, and PMPs are violated, it results in an access fault corresponding to the request being indicated. - ----------------------------- -Translation Lookaside Buffer ----------------------------- - -Page tables are accessed for translating virtual memory addresses to physical memory addresses. This translation needs to be carried out for every load and store instruction and also for every instruction fetch. Since page tables are resident in physical memory, accessing these tables in all these situations has a significant impact on performance. Page table accesses occur in patterns that are closely related in time. Furthermore, the spatial and temporal locality of data accesses or instruction fetches mean that the same page is referenced repeatedly. Taking advantage of these access patterns the processor keeps the information of recent address translations, to enable fast retrieval, in a small cache called the Translation Lookaside Buffer (TLB) or an address-translation cache. 
- -The CVA6 TLB is structured as a fully associative cache, where the virtual address that needs to be translated is compared against all the individual TLB entries. Given a virtual address, the processor examines the TLB (TLB lookup) to determine if the virtual page number (VPN) of the page being accessed is in the TLB. When a TLB entry is found (TLB hit), the TLB returns the corresponding physical page number (PPN) which is used to calculate the target physical address. If no TLB entry is found (TLB miss) the processor has to read individual page table entries from memory (Table walk). In CVA6 table walking is supported by dedicated hardware. Once the processor finishes the table walk it has the Physical Page Number (PPN) corresponding to the Virtual Page Number (VPN) that needs to be translated. The processor adds an entry for this address translation to the TLB so future translations of that virtual address will happen quickly through the TLB. During the table walk the processor may find out that the corresponding physical page is not resident in memory. At this stage a page table exception (Page Fault) is generated which gets handled by the operating system. The operating system places the appropriate page in memory, updates the appropriate page tables and returns execution to the instruction which generated the exception. - -The input and output signals of the TLB are shown in the following figure. - -.. figure:: ../images/in_out_tlb.png - :name: **Figure 4:** Inputs and Outputs of CVA6 TLB - :align: center - :width: 65% - :alt: in_out_tlb - - **Figure 4:** Inputs and Outputs of CVA6 TLB - -.. raw:: html - - Signal Description of TLB - -.. raw:: html - -

Table 6: CVA6 TLB Input Output Signals

- -.. list-table:: - :header-rows: 1 - - * - Signal - - IO - - Connection - - Type - - Description - - * - ``clk_i`` - - in - - SUBSYSTEM - - logic - - Subsystem Clock - - * - ``rst_ni`` - - in - - SUBSYSTEM - - logic - - Asynchronous reset active low - - * - ``flush_i`` - - in - - Controller - - logic - - TLB flush request - - * - ``update_i`` - - in - - Shared TLB - - tlb_update_sv32_t - - Updated tag and content of TLB - - * - ``lu_access_i`` - - in - - Cache Subsystem - - logic - - Signal indicating a lookup access is being requested - - * - ``lu_asid_i`` - - in - - CSR RegFile - - logic[ASID_WIDTH-1:0] - - ASID (Address Space Identifier) for the lookup - - * - ``lu_vaddr_i`` - - in - - Cache Subsystem - - logic[riscv::VLEN-1:0] - - Virtual address for the lookup - - * - ``lu_content_o`` - - out - - MMU SV32 - - riscv::pte_sv32_t - - Output for the content of the TLB entry - - * - ``asid_to_be_flushed_i`` - - in - - Execute Stage - - logic[ASID_WIDTH-1:0] - - ASID of the entry to be flushed - - * - ``vaddr_to_be_flushed_i`` - - in - - Execute Stage - - logic[riscv::VLEN-1:0] - - Virtual address of the entry to be flushed - - * - ``lu_is_4M_o`` - - out - - MMU SV32 - - logic - - Output indicating whether the TLB entry corresponds to a 4MB page - - * - ``lu_hit_o`` - - out - - MMU SV32 - - logic - - Output indicating whether the lookup resulted in a hit or miss - -.. raw:: html - - Struct Description - -.. raw:: html - -

Table 7: SV32 TLB Update Struct (tlb_update_sv32_t)

- -.. list-table:: - :header-rows: 1 - - * - Signal - - Type - - Description - - * - ``valid`` - - logic - - Indicates whether the TLB update entry is valid or not - - * - ``is_4M`` - - logic - - Indicates if the TLB entry corresponds to a 4MB page - - * - ``vpn`` - - logic[19:0] - - Virtual Page Number (VPN) used for updating the TLB, consisting of 20 bits - - * - ``asid`` - - logic[8:0] - - Address Space Identifier (ASID) used for updating the TLB, with a length of 9 bits for Sv32 MMU - - * - ``content`` - - riscv::pte_sv32_t - - Content of the TLB update entry, defined by the structure - -.. raw:: html - -

Table 8: SV32 PTE Struct (riscv::pte_sv32_t)

- -.. list-table:: - :header-rows: 1 - - * - Signal - - Type - - Description - - * - ``ppn`` - - logic[21:0] - - 22 bit Physical Page Number (PPN) - - * - ``rsw`` - - logic[1:0] - - Reserved for use by supervisor software - - * - ``d`` - - logic - - | Dirty bit indicating whether the page has been modified (dirty) or not - | 0: Page is clean i.e., has not been written - | 1: Page is dirty i.e., has been written - - * - ``a`` - - logic - - | Accessed bit indicating whether the page has been accessed - | 0: Virtual page has not been accessed since the last time A bit was cleared - | 1: Virtual page has been read, written, or fetched from since the last time the A bit was cleared - - * - ``g`` - - logic - - | Global bit marking a page as part of a global address space valid for all ASIDs - | 0: Translation is valid for specific ASID - | 1: Translation is valid for all ASIDs - - * - ``u`` - - logic - - | User bit indicating privilege level of the page - | 0: Page is not accessible in user mode but in supervisor mode - | 1: Page is accessible in user mode but not in supervisor mode - - * - ``x`` - - logic - - | Execute bit which allows execution of code from the page - | 0: Code execution is not allowed - | 1: Code execution is permitted - - * - ``w`` - - logic - - | Write bit allows the page to be written - | 0: Write operations are not allowed - | 1: Write operations are permitted - - * - ``r`` - - logic - - | Read bit allows read access to the page - | 0: Read operations are not allowed - | 1: Read operations are permitted - - * - ``v`` - - logic - - | Valid bit indicating the page table entry is valid - | 0: Page is invalid i.e. page is not in DRAM, translation is not valid - | 1: Page is valid i.e. page resides in the DRAM, translation is valid - -.. raw:: html - - TLB Entry Fields - -The number of TLB entries can be changed via a design parameter. In 32-bit configurations of CVA6 only 2 TLB entries are instantiated. 
Each TLB entry is made up of two fields: Tag and Content. The Tag field holds the virtual page number (VPN1, VPN0), ASID, page size (is_4M) along with a valid bit (VALID) indicating that the entry is valid. The SV32 virtual page number, which is supported by CV32A6X, is further split into two separate virtual page numbers VPN1 and VPN0. The Content field contains two physical page numbers (PPN1, PPN0) along with a number of bits which specify various attributes of the physical page. Note that the V bit in the Content field is the V bit which is present in the page table in memory. It is copied from the page table, as is, and the VALID bit in the Tag is set based on its value. The TLB entry fields are shown in **Figure 5**. - -.. figure:: ../images/cva6_tlb_entry.png - :name: **Figure 5:** Fields in CVA6 TLB entry - :align: center - :width: 80% - :alt: cva6_tlb_entry - - **Figure 5:** Fields in CVA6 TLB entry - -.. raw:: html - - CVA6 TLB Management / Implementation - -The CVA6 TLB implements the following three functions: - -* **Translation:** This function implements the address lookup and match logic. -* **Update and Flush:** This function implements the update and flush logic. -* **Pseudo Least Recently Used Replacement Policy:** This function implements the replacement policy for TLB entries. - -.. raw:: html - - Translation - -This function takes in the virtual address and certain other fields, examines the TLB to determine if the virtual page number of the page being accessed is in the TLB or not. If a TLB entry is found (TLB hit), the TLB returns the corresponding physical page number (PPN) which is then used to calculate the target physical address. The following checks are done as part of this lookup function to find a match in the TLB: - -* **Validity Check:** For a TLB hit, the associated TLB entry must be valid. -* **ASID and Global Flag Check:** The TLB entry's ASID must match the given ASID (ASID associated with the Virtual address).
If the TLB entry’s Global (G) bit is set, this check is skipped. This ensures that the translation is either specific to the provided ASID or it is globally applicable. -* **Level 1 VPN match:** SV32 implements a two-level page table. As such the virtual address is broken up into three parts which are the virtual page number 1, virtual page number 0 and displacement. So the condition that is checked next is that the virtual page number 1 of the virtual address matches the virtual page number 1 (VPN1) of the TLB entry. -* **Level 0 VPN match or 4-Mega Page:** The last condition to be checked, for a TLB hit, is that the virtual page number 0 of the virtual address matches the virtual page number 0 of the TLB entry (VPN0). This match is ignored if the is_4M bit in the Tag is set which implies a super 4M page. - -All the conditions listed above are checked against every TLB entry. If there is a TLB hit then the corresponding bit in the hit array is set. **Figure 6** illustrates the TLB hit/miss process listed above. - -.. figure:: ../images/cva6_tlb_hit.png - :name: **Figure 6:** Block diagram of CVA6 TLB hit or miss - :align: center - :width: 75% - :alt: cva6_tlb_hit - - **Figure 6:** Block diagram of CVA6 TLB hit or miss - -.. raw:: html - - Flushing TLB entries - -The SFENCE.VMA instruction can be used with certain specific source register specifiers (rs1 & rs2) to flush a specific TLB entry, some set of TLB entries or all TLB entries. Like all instructions this action only takes place when the SFENCE.VMA instruction is committed (shown via the commit_sfence signal in the following figures.)
The behavior of the instruction is as follows: - -* **If rs1 is not equal to x0 and rs2 is not equal to x0:** Invalidate all TLB entries which contain leaf page table entries corresponding to the virtual address in rs1 (shown below as Virtual Address to be flushed) and that match the address space identifier as specified by integer register rs2 (shown below as asid_to_be_flushed_i), except for entries containing global mappings. This is referred to as the “SFENCE.VMA vaddr asid” case. - -.. figure:: ../images/sfence_vaddr_asid.png - :name: **Figure 7:** Invalidate TLB entry if ASID and virtual address match - :align: center - :width: 75% - :alt: sfence_vaddr_asid - - **Figure 7:** Invalidate TLB entry if ASID and virtual address match - -* **If rs1 is equal to x0 and rs2 is equal to x0:** Invalidate all TLB entries for all address spaces. This is referred to as the "SFENCE.VMA x0 x0" case. - -.. figure:: ../images/sfence_x0_x0.png - :name: **Figure 8:** Invalidate all TLB entries if both source register specifiers are x0 - :align: center - :width: 62% - :alt: sfence_x0_x0 - - **Figure 8:** Invalidate all TLB entries if both source register specifiers are x0 - -* **If rs1 is not equal to x0 and rs2 is equal to x0:** invalidate all TLB entries that contain leaf page table entries corresponding to the virtual address in rs1, for all address spaces. This is referred to as the “SFENCE.VMA vaddr x0” case. - -.. figure:: ../images/sfence_vaddr_x0.png - :name: **Figure 9:** Invalidate TLB entry with matching virtual address for all address spaces - :align: center - :width: 75% - :alt: sfence_vaddr_x0 - - **Figure 9:** Invalidate TLB entry with matching virtual address for all address spaces - -* **If rs1 is equal to x0 and rs2 is not equal to x0:** Invalidate all TLB entries matching the address space identified by integer register rs2, except for entries containing global mappings. This is referred to as the “SFENCE.VMA 0 asid” case. - -.. 
figure:: ../images/sfence_x0_asid.png - :name: **Figure 10:** Invalidate TLB entry for matching ASIDs - :align: center - :width: 75% - :alt: sfence_x0_asid - - **Figure 10:** Invalidate TLB entry for matching ASIDs - -.. raw:: html - - Updating TLB - -When a TLB valid update request is signaled by the shared TLB, and the replacement policy selects the update of a specific TLB entry, the corresponding entry's tag is updated with the new tag, and its associated content is refreshed with the information from the update request. This ensures that the TLB entry accurately reflects the new translation information. - -.. raw:: html - - Pseudo Least Recently Used Replacement Policy - -Cache replacement algorithms are used to determine which TLB entry should be replaced, because it is not likely to be used in the near future. The Pseudo-Least-Recently-Used (PLRU) is a cache entry replacement algorithm, derived from Least-Recently-Used (LRU) cache entry replacement algorithm, used by the TLB. Instead of precisely tracking recent usage as the LRU algorithm does, PLRU employs an approximate measure to determine which entry in the cache has not been recently used and as such can be replaced. - -CVA6 implements the PLRU algorithm via the Tree-PLRU method which implements a binary tree. The TLB entries are the leaf nodes of the tree. Each internal node of the tree consists of a single bit, referred to as the state bit or plru bit, indicating which subtree contains the (pseudo) least recently used entry (the PLRU); 0 for the left hand tree and 1 for the right hand tree. Traversing the tree from the root by following these bits, the leaf node reached corresponds to the PLRU entry which can be replaced. Having accessed an entry (so as to replace it) we need to promote that entry to be the Most Recently Used (MRU) entry. This is done by updating the value of each node along the access path to point away from that entry.
If the accessed entry is a right child i.e., its parent node value is 1, it is set to 0, and if the parent is the left child of its parent (the grandparent of the accessed node) then its node value is set to 1 and so on all the way up to the root node. - -The PLRU binary tree is implemented as an array of node values. Nodes are organized in the array based on levels, with those from lower levels appearing before higher ones. Furthermore those on the left side of a node appear before those on the right side of a node. The figure below shows a tree and the corresponding array. - -.. figure:: ../images/plru_tree_indexing.png - :name: **Figure 11:** PLRU Tree Indexing - :align: center - :width: 60% - :alt: plru_tree_indexing - - **Figure 11:** PLRU Tree Indexing - -For n-way associative, we require n - 1 internal nodes in the tree. With those nodes, two operations need to be performed efficiently. - -* Promote the accessed entry to be MRU -* Identify which entry to replace (i.e. the PLRU entry) - -.. raw:: html - - Updating the PLRU-Tree - -For a TLB entry which is accessed, the following steps are taken to make it the MRU: - -1. Iterate through each level of the binary tree. -2. Calculate the index of the leftmost child within the current level. Let us call that index the index base. -3. Calculate the shift amount to identify the relevant node based on the level and TLB entry index. -4. Calculate the new value that the node should have in order to make the accessed entry the Most Recently Used (MRU). The new value of the root node is the opposite of the TLB entry index, MSB at the root node, MSB - 1 at node at next level and so on. -5. Assign this new value to the relevant node, ensuring that the hit entry becomes the MRU within the binary tree structure. - -At level 0, no bit of the TLB entry’s index determines the offset from the index base because it’s a root node. At level 1, MSB of entry’s index determines the amount of offset from index base at that level. 
At level 2, the first two bits of the entry's index from MSB side determine the offset from the index base because there are 4 nodes at the level 2 and so on. - -.. figure:: ../images/update_tree.png - :name: **Figure 12:** Promote Entry to be MRU - :align: center - :width: 82% - :alt: update_tree - - **Figure 12:** Promote Entry to be MRU - -In the above figure, the entry at index 5 is accessed. To make it the MRU entry, every node along the access path should point away from it. Entry 5 is a right child, so its parent’s plru bit is set to 0; that parent is a left child, so the grandparent’s plru bit is set to 1; and the great-grandparent’s plru bit is set to 0. - -.. raw:: html - - Entry Selection for Replacement - -Every TLB entry is checked for the replacement entry. The following steps are taken: - -1. Iterate through each level of the binary tree. -2. Calculate the index of the leftmost child within the current level. Let us call that index the index base. -3. Calculate the shift amount to identify the relevant node based on the level and TLB entry index. -4. If the corresponding bit of the entry's index matches the value of the node being traversed at the current level, keep the replacement signal high for that entry; otherwise, set the replacement signal to low. - -.. figure:: ../images/replacement_entry.png - :name: **Figure 13:** Possible path traverse for entry selection for replacement - :align: center - :width: 65% - :alt: replacement_entry - - **Figure 13:** Possible path traverse for entry selection for replacement - -The figure above shows every possible path that can be traversed to find the PLRU entry. If the plru bit at each level matches the corresponding bit of the entry's index, that entry is the next one to replace. The table below shows the entry selection for replacement. - -.. raw:: html - -

Table 9: Entry Selection for Replacement

- -+-------------------+---------------+----------------------+ -| **Path Traverse** | **PLRU Bits** | **Entry to replace** | -+-------------------+---------------+----------------------+ -| 0 -> 1 -> 3 | 000 | 0 | -| +---------------+----------------------+ -| | 001 | 1 | -+-------------------+---------------+----------------------+ -| 0 -> 1 -> 4 | 010 | 2 | -| +---------------+----------------------+ -| | 011 | 3 | -+-------------------+---------------+----------------------+ -| 0 -> 2 -> 5 | 100 | 4 | -| +---------------+----------------------+ -| | 101 | 5 | -+-------------------+---------------+----------------------+ -| 0 -> 2 -> 6 | 110 | 6 | -| +---------------+----------------------+ -| | 111 | 7 | -+-------------------+---------------+----------------------+ - ------------------------------------ -Shared Translation Lookaside Buffer ------------------------------------ - -The CVA6 shared TLB is structured as a 2-way associative cache, where the virtual address requiring translation is compared with the set indicated by the virtual page number. The shared TLB is looked up in case of an Instruction TLB (ITLB) or data TLB (DTLB) miss, signaled by these TLBs. If the entry is found in the shared TLB set, the respective TLB, whose translation is being requested, is updated. If the entry is not found in the shared TLB, then the processor has to perform a page table walk. Once the processor obtains a PPN corresponding to the VPN, the shared TLB is updated with this information. If the physical page is not found in the page table, it results in a page fault, which is handled by the operating system. The operating system will then place the corresponding physical page in memory. - -The inputs and output signals of the shared TLB are shown in the following two figures. - -.. 
figure:: ../images/shared_tlb_in_out.png - :name: **Figure 14:** Inputs and outputs of CVA6 shared TLB - :align: center - :width: 60% - :alt: shared_tlb_in_out - - **Figure 14:** Inputs and outputs of CVA6 shared TLB - -.. raw:: html - - Signal Description - -.. raw:: html - -

Table 10: Signal Description of CVA6 shared TLB

- -.. list-table:: - :header-rows: 1 - - * - Signal - - IO - - Connection - - Type - - Description - - * - ``clk_i`` - - in - - Subsystem - - logic - - Subsystem Clock - - * - ``rst_ni`` - - in - - Subsystem - - logic - - Asynchronous reset active low - - * - ``flush_i`` - - in - - Controller - - logic - - TLB flush request - - * - ``enable_translation_i`` - - in - - CSR Regfile - - logic - - CSRs indicate to enable Sv32 - - * - ``en_ld_st_translation_i`` - - in - - CSR Regfile - - logic - - Enable virtual memory translation for load/stores - - * - ``asid_i`` - - in - - CSR Regfile - - logic - - ASID for the lookup - - * - ``itlb_access_i`` - - in - - Cache Subsystem - - logic - - Signal indicating a lookup access in ITLB is being requested. - - * - ``itlb_hit_i`` - - in - - ITLB - - logic - - Signal indicating an ITLB hit - - * - ``itlb_vaddr_i`` - - in - - Cache Subsystem - - logic[31:0] - - Virtual address lookup in ITLB - - * - ``dtlb_access_i`` - - in - - Load/Store Unit - - logic - - Signal indicating a lookup access in DTLB is being requested. 
- - * - ``dtlb_hit_i`` - - in - - DTLB - - logic - - Signal indicating a DTLB hit - - * - ``dtlb_vaddr_i`` - - in - - Load/Store Unit - - logic[31:0] - - Virtual address lookup in DTLB - - * - ``itlb_update_o`` - - out - - ITLB - - tlb_update_sv32_t - - Tag and content to update ITLB - - * - ``dtlb_update_o`` - - out - - DTLB - - tlb_update_sv32_t - - Tag and content to update DTLB - - * - ``itlb_miss_o`` - - out - - Performance Counter - - logic - - Signal indicating an ITLB miss - - * - ``dtlb_miss_o`` - - out - - Performance Counter - - logic - - Signal indicating a DTLB miss - - * - ``shared_tlb_access_o`` - - out - - PTW - - logic - - Signal indicating a lookup access in shared TLB is being requested - - * - ``shared_tlb_hit_o`` - - out - - PTW - - logic - - Signal indicating a shared TLB hit - - * - ``shared_tlb_vaddr_o`` - - out - - PTW - - logic[31:0] - - Virtual address lookup in shared TLB - - * - ``itlb_req_o`` - - out - - PTW - - logic - - ITLB Request Output - - * - ``shared_tlb_update_i`` - - in - - PTW - - tlb_update_sv32_t - - Updated tag and content of shared TLB - -.. raw:: html - - Struct Description - -.. raw:: html - -

Table 11: Shared TLB Update Struct (shared_tag_t)

- -.. list-table:: - :header-rows: 1 - - * - Signal - - Type - - Description - - * - ``is_4M`` - - logic - - Indicates if the shared TLB entry corresponds to a 4MB page. - - * - ``vpn1`` - - logic[9:0] - - Virtual Page Number (VPN) represents the index of PTE in the page table level 1. - - * - ``vpn0`` - - logic[9:0] - - Virtual Page Number (VPN) represents the index of PTE in the page table level 0. - - * - ``asid`` - - logic - - Address Space Identifier (ASID) used to identify different address spaces - -.. raw:: html - - Shared TLB Entry Structure - -Shared TLB is 2-way associative, with a depth of 64. A single entry in the set contains the valid bit, tag and the content. The Tag segment stores details such as the virtual page number (VPN1, VPN0), ASID, and page size (is_4M). The Content field contains two physical page numbers (PPN1, PPN0) along with a number of bits which specify various attributes of the physical page. - -.. figure:: ../images/shared_tlb.png - :name: **Figure 15:** CVA6 Shared TLB Structure - :align: center - :width: 60% - :alt: shared_tlb - - **Figure 15:** CVA6 Shared TLB Structure - -.. raw:: html - - Shared TLB Implementation in CVA6 - -The implementation of a shared TLB in CVA6 is described in the following sections: - -* **ITLB and DTLB Miss:** Prepare a shared TLB lookup if the entry is not found in ITLB or DTLB. -* **Tag Comparison:** Look up the provided virtual address in the shared TLB. -* **Update and Flush:** Flush the shared TLB or update it. -* **Replacement Policies:** First non-valid entry and random replacement policy. - -.. raw:: html - - ITLB and DTLB Miss - -Consider a scenario where an entry is found in the ITLB or DTLB. In this case, there is no need to perform a lookup in the shared TLB since the entry has already been found. Next, there are two scenarios: an ITLB miss or a DTLB miss. - -To identify an ITLB miss, the following conditions need to be fulfilled: - -* Address translation must be enabled. 
-* There must be an access request to the ITLB. -* The ITLB should indicate an ITLB miss. -* There should be no access request to the DTLB. - -During an ITLB miss, access is granted to read the tag and content of the shared TLB from their respective SRAMs. The address for reading the tag and content of the shared TLB entry is calculated using the virtual address for which translation is not found in the ITLB. The ITLB miss is also explicitly indicated by the shared TLB. A request for shared TLB access is initiated. - -To identify the DTLB miss, the following conditions need to be fulfilled: - -* Address translation for loads and stores must be enabled. -* There must be an access request to the DTLB. -* The DTLB should indicate a DTLB miss. - -In the case of a DTLB miss, the same logic is employed as described for an ITLB miss. - -.. raw:: html - - Tag Comparison - -Shared TLB lookup for a hit occurs under the same conditions as described for the TLB modules used as ITLB and DTLB. However, there are some distinctions. In both the ITLB and DTLB, the virtual address requiring translation is compared against all TLB entries. In contrast, the shared TLB only compares the tag and content of the set indicated by the provided virtual page number. The index of the set is extracted from VPN0 of the requested virtual address. Given that the shared TLB is 2-way associative, each set contains two entries. Consequently, both of these entries are compared. The figure below illustrates how the set is selected for the lookup. - -.. figure:: ../images/shared_tlb_set.png - :name: **Figure 16:** Set opted for lookup in shared TLB - :align: center - :width: 60% - :alt: shared_tlb_set - - **Figure 16:** Set opted for lookup in shared TLB - -.. raw:: html - - Update and Flush - -Unlike the ITLB and DTLB, a specific virtual address or address space cannot be flushed in the shared TLB. When SFENCE.VMA is committed, all entries in the shared TLB are invalidated.
(Cases of SFENCE.VMA should also be added in shared TLB) - -.. raw:: html - - Updating Shared TLB - -When the Page Table Walker signals a valid update request, the shared TLB is updated by selecting an entry through the replacement policy and marking it as valid. This also triggers the writing of the new tag and content to the respective SRAM. - -.. raw:: html - - Replacement Policy Implemented in CVA6 Shared TLB - -In CVA6's shared TLB, two replacement policies are employed for replacements based on a specific condition. These replacement policies select the entry within the set indicated by the virtual page number. The two policies are: - -* First non-valid encounter replacement policy -* Random replacement policy - -The first replacement policy fails if all ways are valid. In that case, the random replacement policy is used instead. - -.. raw:: html - - First non-valid encounter replacement policy - -The module implemented in CVA6 to find the first non-valid entry in the shared TLB is the Leading Zero Counter (LZC). It takes three parameters as input: - -1. **WIDTH:** The width of the input vector. -2. **MODE:** Mode selection - 0 for trailing zero, 1 for leading zero. -3. **CNT WIDTH:** Width of the output signal containing the zero count. - -The input signal is the vector to be counted, and the output represents the count of trailing/leading zeros. If all bits in the input vector are zero, it will also be indicated. - -When initializing the module, the width of the input vector is set to the number of shared TLB ways. The trailing zero counter mode is selected. The vector of valid bits is set as the input vector, but with negation. This is because we want the index of the first non-valid entry, and LZC returns the count of trailing zeros, which actually corresponds to the index of the first occurrence of 1 from the least significant bit (LSB). If there is at least one non-valid entry, that entry is selected for replacement; if not, this is signaled by the LZC. - -..
figure:: ../images/LZC.png - :name: **Figure 17:** Replacement of First invalid entry. - :align: center - :width: 60% - :alt: LZC - - **Figure 17:** Replacement of First invalid entry. - -.. raw:: html - - Random replacement policy - -If all ways are valid, a random replacement policy is employed for the replacement process. The Linear Feedback Shift Register (LFSR) is utilized to select the replacement entry randomly. LFSR is commonly used in generating sequences of pseudo-random numbers. When the enable signal is active, the current state of the LFSR undergoes a transformation. Specifically, the state is shifted right by one bit, and the result is combined with a predetermined masking pattern. This masking pattern is derived from the predefined “Masks” array, introducing a non-linear behavior to the sequence generation of the LFSR. The masking process involves XOR operations between the shifted state bits and specific pattern bits, contributing to the complexity and unpredictability of the generated sequence. - -.. figure:: ../images/RR.png - :name: **Figure 18:** Entry selection for replacement using LFSR - :align: center - :width: 95% - :alt: RR - - **Figure 18:** Entry selection for replacement using LFSR - ------------------ -Page Table Walker ------------------ - -The "CVA6 Page Table Walker (PTW) for MMU Sv32" is a hardware module developed for the CV32A6 processor architecture, designed to facilitate the translation of virtual addresses into physical addresses, a crucial task in memory access management. - -.. figure:: ../images/ptw_in_out.png - :name: **Figure 19:** Input and Outputs of Page Table Walker - :align: center - :width: 60% - :alt: ptw_in_out - - **Figure 19:** Input and Outputs of Page Table Walker - -.. raw:: html - - Operation of PTW Module - -The PTW module operates through various states, each with its specific function, such as handling memory access requests, validating page table entries, and responding to errors. - -.. 
raw:: html - - Key Features and Capabilities - -Key features of this PTW module include support for two levels of page tables (LVL1 and LVL2) in the Sv32 standard, accommodating instruction and data page table walks. It rigorously validates and verifies page table entries (PTEs) to ensure translation accuracy and adherence to access permissions. This module seamlessly integrates with the CV32A6 processor's memory management unit (MMU), which governs memory access control. It also takes into account global mapping, access flags, and privilege levels during the translation process, ensuring that memory access adheres to the processor's security and privilege settings. - -.. raw:: html - - Exception Handling - -In addition to its translation capabilities, the PTW module is equipped to detect and manage errors, including page-fault exceptions and access exceptions, contributing to the robustness of the memory access system. It works harmoniously with physical memory protection (PMP) configurations, a critical aspect of modern processors' memory security. Moreover, the module efficiently processes virtual addresses, generating corresponding physical addresses, all while maintaining speculative translation, a feature essential for preserving processor performance during memory access operations. - -.. raw:: html - - Signal Description - -.. raw:: html - -

Table 12: Signal Description of PTW

- -.. list-table:: - :header-rows: 1 - - * - Signal - - IO - - Connection - - Type - - Description - - * - ``clk_i`` - - in - - Subsystem - - logic - - Subsystem Clock - - * - ``rst_ni`` - - in - - Subsystem - - logic - - Asynchronous reset active low - - * - ``flush_i`` - - in - - Controller - - logic - - Sfence Committed - - * - ``ptw_active_o`` - - out - - MMU - - logic - - Output signal indicating whether the Page Table Walker (PTW) is currently active - - * - ``walking_instr_o`` - - out - - MMU - - logic - - Indicating it's an instruction page table walk or not - - * - ``ptw_error_o`` - - out - - MMU - - logic - - Output signal indicating that an error occurred during PTW operation - - * - ``ptw_access_exception_o`` - - out - - MMU - - logic - - Output signal indicating that a PMP (Physical Memory Protection) access exception occurred during PTW operation. - - * - ``lsu_is_store_i`` - - in - - Store Unit - - logic - - Input signal indicating whether the translation was triggered by a store operation. 
- - * - ``req_port_i`` - - in - - Cache Subsystem - - dcache_req_o_t - - D Cache Data Response - - * - ``req_port_o`` - - out - - Cache Subsystem / Perf Counter - - dcache_req_i_t - - D Cache Data Requests - - * - ``shared_tlb_update_o`` - - out - - Shared TLB - - tlb_update_sv32_t - - Updated tag and content of shared TLB - - * - ``update_vaddr_o`` - - out - - MMU - - logic[riscv::VLEN-1:0] - - Updated VADDR from shared TLB - - * - ``asid_i`` - - in - - CSR RegFile - - logic[ASID_WIDTH-1:0] - - ASID for the lookup - - * - ``shared_tlb_access_i`` - - in - - Shared TLB - - logic - - Access request of shared TLB - - * - ``shared_tlb_hit_i`` - - in - - Shared TLB - - logic - - Indicate shared TLB hit - - * - ``shared_tlb_vaddr_i`` - - in - - Shared TLB - - logic[riscv::VLEN-1:0] - - Virtual Address from shared TLB - - * - ``itlb_req_i`` - - in - - Shared TLB - - logic - - Indicate request to ITLB - - * - ``satp_ppn_i`` - - in - - CSR RegFile - - logic[riscv::PPNW-1:0] - - PPN of top level page table from SATP register - - * - ``mxr_i`` - - in - - CSR RegFile - - logic - - Make Executable Readable bit in xSTATUS CSR register - - * - ``shared_tlb_miss_o`` - - out - - OPEN - - logic - - Indicate a shared TLB miss - - * - ``pmpcfg_i`` - - in - - CSR RegFile - - riscv::pmpcfg_t[15:0] - - PMP configuration - - * - ``pmpaddr_i`` - - in - - CSR RegFile - - logic[15:0][riscv::PLEN-3:0] - - PMP Address - - * - ``bad_paddr_o`` - - out - - MMU - - logic[riscv::PLEN-1:0] - - Bad Physical Address in case of access exception - -.. raw:: html - - Struct Description - -.. raw:: html - -

Table 13: D Cache Request Struct (dcache_req_i_t)

- -.. list-table:: - :header-rows: 1 - - * - Signal - - Type - - Description - - * - ``address_index`` - - logic [DCACHE_INDEX_WIDTH-1:0] - - Index of the Dcache Line - - * - ``address_tag`` - - logic [DCACHE_TAG_WIDTH-1:0] - - Tag of the Dcache Line - - * - ``data_wdata`` - - riscv::xlen_t - - Data to write in the Dcache - - * - ``data_wuser`` - - logic [DCACHE_USER_WIDTH-1:0] - - data_wuser - - * - ``data_req`` - - logic - - Data Request - - * - ``data_we`` - - logic - - Data Write enabled - - * - ``data_be`` - - logic [(riscv::XLEN/8)-1:0] - - Data Byte enable - - * - ``data_size`` - - logic [1:0] - - Size of data - - * - ``data_id`` - - logic [DCACHE_TID_WIDTH-1:0] - - Data ID - - * - ``kill_req`` - - logic - - Kill the D cache request - - * - ``tag_valid`` - - logic - - Indicate that the tag is valid - -.. raw:: html - -

Table 14: D Cache Response Struct (dcache_req_o_t)

- -.. list-table:: - :header-rows: 1 - - * - Signal - - Type - - Description - - * - ``data_gnt`` - - logic - - Grant of data is given in response to the data request - - * - ``data_rvalid`` - - logic - - Indicate that data is valid which is sent by D cache - - * - ``data_rid`` - - logic [DCACHE_TID_WIDTH-1:0] - - Requested data ID - - * - ``data_rdata`` - - riscv::xlen_t - - Data from D cache - - * - ``data_ruser`` - - logic [DCACHE_USER_WIDTH-1:0] - - Requested data user - -.. raw:: html - - PTW State Machine - -Page Table Walker is implemented as a finite state machine. It listens to shared TLB for incoming translation requests. If there is a shared TLB miss, it saves the virtual address and starts the page table walk. The Page Table Walker transitions between 7 states in CVA6. - -* **IDLE:** The initial state where the PTW is awaiting a trigger, often a Shared TLB miss, to initiate a memory access request. -* **WAIT_GRANT:** Request memory access and wait for data grant -* **PTE_LOOKUP:** Once granted access, the PTW examines the valid Page Table Entry (PTE), checking attributes to determine the appropriate course of action. -* **PROPAGATE_ERROR:** If the PTE is invalid, this state handles the propagation of an error, often leading to a page-fault exception due to non-compliance with access conditions -* **PROPAGATE_ACCESS_ERROR:** Propagate access fault if access is not allowed from a PMP perspective -* **WAIT_RVALID:** After processing a PTE, the PTW waits for a valid data signal, indicating that relevant data is ready for further processing. -* **LATENCY:** Introduces a delay to account for synchronization or timing requirements between states. - -.. figure:: ../images/ptw_state_diagram.png - :name: **Figure 20:** State Machine Diagram of CVA6 PTW - :align: center - :width: 95% - :alt: ptw_state_diagram - - **Figure 20:** State Machine Diagram of CVA6 PTW - -..
raw:: html - - IDLE state - -In the IDLE state of the Page Table Walker (PTW) finite state machine, the system awaits a trigger to initiate the page table walk process. This trigger is often prompted by a Shared Translation Lookaside Buffer (TLB) miss, indicating that the required translation is not present in the shared TLB cache. The PTW's behavior in this state is explained as follows: - -1. The top-most page table is selected for the page table walk. In the case of SV32, which implements a two-level page table, the level 1 page table is chosen. -2. In the IDLE state, translations are assumed to be invalid in all addressing spaces. -3. The signal indicating the instruction page table walk is set to 0. -4. A conditional check is performed: if there is a shared TLB access request and the entry is not found in the shared TLB (indicating a shared TLB miss), the following steps are executed: - - a. The address of the desired Page Table Entry within the level 1 page table is calculated by multiplying the Physical Page Number (PPN) of the level 1 page table from the SATP register by the page size (4kB). This result is then added to the product of the Virtual Page Number (VPN1), and the size of a page table entry(4 bytes). - -.. figure:: ../images/ptw_idle.png - :name: **Figure 21:** Address of Desired PTE at Level 1 - :align: center - :width: 68% - :alt: ptw_idle - - **Figure 21:** Address of Desired PTE at Level 1 - -.. _example: - - b. The signal indicating whether it's an instruction page table walk is updated based on the ITLB miss. - c. The ASID and virtual address are saved for the page table walk. - d. A shared TLB miss is indicated. - -.. raw:: html - - WAIT GRANT state - -In the **WAIT_GRANT** state of the Page Table Walker's finite state machine, a data request is sent to retrieve memory information. It waits for a data grant signal from the Dcache controller, remaining in this state until granted. 
Once granted, it activates a tag valid signal, marking data validity. The state then transitions to "PTE_LOOKUP" for page table entry lookup. - -.. raw:: html - - PTE LOOKUP state - -In the **PTE_LOOKUP** state of the Page Table Walker (PTW) finite state machine, the PTW performs the actual lookup and evaluation of the page table entry (PTE) based on the virtual address translation. The behavior and operations performed in this state are detailed as follows: - -1. The state waits for a valid signal indicating that the data from the memory subsystem, specifically the page table entry, is available for processing. -2. Upon receiving the valid signal, the PTW proceeds with examining the retrieved page table entry to determine its properties and validity. -3. The state checks if the global mapping bit in the PTE is set, and if so, sets the global mapping signal to indicate that the translation applies globally across all address spaces. -4. The state distinguishes between two cases: Invalid PTE and Valid PTE. - - a. If the valid bit of the PTE is not set, or if the PTE has reserved RWX field encodings, it signifies an Invalid PTE. In such cases, the state transitions to the "PROPAGATE_ERROR" state, indicating a page-fault exception due to an invalid translation. - -.. figure:: ../images/ptw_pte_1.png - :name: **Figure 22:** Invalid PTE and reserved RWX encoding leads to page fault - :align: center - :width: 70% - :alt: ptw_pte_1 - - **Figure 22:** Invalid PTE and reserved RWX encoding leads to page fault - -.. _example1: - - b. If the PTE is valid, the state advances to the "LATENCY" state, indicating a period of processing latency. Additionally, if the "read" flag (pte.r) or the "execute" flag (pte.x) is set, the PTE is considered valid. - -5. Within the Valid PTE scenario, the state performs further checks based on whether the translation is intended for instruction fetching or data access: - - a. 
For instruction page table walk, if the page is not executable (pte.x is not set) or not marked as accessible (pte.a is not set), the state transitions to the "PROPAGATE_ERROR" state. - -.. figure:: ../images/ptw_iptw.png - :name: **Figure 23:** For Instruction Page Table Walk - :align: center - :width: 70% - :alt: ptw_iptw - - **Figure 23:** For Instruction Page Table Walk - -.. _example2: - - b. For data page table walk, the state checks if the page is readable (pte.r is set) or if the page is executable only but made readable by setting the MXR bit in xSTATUS CSR register. If either condition is met, it indicates a valid translation. If not, the state transitions to the "PROPAGATE_ERROR" state. - -.. figure:: ../images/ptw_dptw.png - :name: **Figure 24:** Data Access Page Table Walk - :width: 70% - :alt: ptw_dptw - - **Figure 24:** Data Access Page Table Walk - -.. _example3: - - c. If the access is intended for storing data, additional checks are performed: If the page is not writable (pte.w is not set) or if it is not marked as dirty (pte.d is not set), the state transitions to the "PROPAGATE_ERROR" state. - -.. figure:: ../images/ptw_dptw_s.png - :name: **Figure 25:** Data Access Page Table Walk, Store requested - :align: center - :width: 70% - :alt: ptw_dptw_s - - **Figure 25:** Data Access Page Table Walk, Store requested - -6. The state also checks for potential misalignment issues in the translation: If the current page table level is the first level (LVL1) and if the PPN0 of in PTE is not zero, it indicates a misaligned superpage, leading to a transition to the "PROPAGATE_ERROR" state. - -.. figure:: ../images/ptw_mis_sup.png - :name: **Figure 26:** Misaligned Superpage Check - :align: center - :width: 70% - :alt: ptw_mis_sup - - **Figure 26:** Misaligned Superpage Check - -7. 
If the PTE is valid but the page is neither readable nor executable, the PTW recognizes the PTE as a pointer to the next level of the page table, indicating that additional translation information can be found in the referenced page table at a lower level. -8. If the current page table level is the first level (LVL1), the PTW proceeds to switch to the second level (LVL2) page table, updating the next level pointer and calculating the address for the next page table entry using the Physical Page Number from the PTE and the index of the level 2 page table from virtual address. - -.. figure:: ../images/ptw_nlvl.png - :name: **Figure 27:** Address of desired PTE at next level of Page Table - :align: center - :width: 70% - :alt: ptw_nlvl - - **Figure 27:** Address of desired PTE at next level of Page Table - -9. The state then transitions to the "WAIT_GRANT" state, indicating that the PTW is awaiting the grant signal to proceed with requesting the next level page table entry. -10. If the current level is already the second level (LVL2), an error is flagged, and the state transitions to the "PROPAGATE_ERROR" state, signifying an unexpected situation where the PTW is already at the last level page table. -11. If the translation access is found to be restricted by the Physical Memory Protection (PMP) settings (allow_access is false), the state updates the shared TLB update signal to indicate that the TLB entry should not be updated. Additionally, the saved address for the page table walk is restored to its previous value, and the state transitions to the "PROPAGATE_ACCESS_ERROR" state. -12. Lastly, if the data request for the page table entry was granted, the state indicates to the cache subsystem that the tag associated with the data is now valid. - -.. figure:: ../images/ptw_pte_flowchart.png - :name: **Figure 28:** Flow Chart of PTE LOOKUP State - :align: center - :alt: ptw_pte_flowchart - - **Figure 28:** Flow Chart of PTE LOOKUP State - -.. 
raw:: html - - PROPAGATE ERROR state - -This state indicates a detected error in the page table walk process, and an error signal is asserted to indicate the Page Table Walker's error condition, triggering a transition to the "LATENCY" state for error signal propagation. - -.. raw:: html - - PROPAGATE ACCESS ERROR state - -This state indicates a detected access error in the page table walk process, and an access error signal is asserted to indicate the Page Table Walker's access error condition, triggering a transition to the "LATENCY" state for access error signal propagation. - -.. raw:: html - - WAIT RVALID state - -This state waits until it gets the "read valid" signal, and when it does, it's ready to start a new page table walk. - -.. raw:: html - - LATENCY state - -The LATENCY state introduces a latency period to allow for necessary system actions or signals to stabilize. After the latency period, the FSM transitions back to the IDLE state, indicating that the system is prepared for a new translation request. - -.. raw:: html - - Flush Scenario - -The first step when a flush is triggered is to check whether the Page Table Entry (PTE) lookup process is currently in progress. If the PTW (Page Table Walker) module is indeed in the middle of a PTE lookup operation, the code then proceeds to evaluate a specific aspect of this operation. - -* **Check for Data Validity (rvalid):** Within the PTE lookup operation, it's important to ensure that the data being used for the translation is valid. In other words, the code checks whether the "rvalid" signal (which likely indicates the validity of the data) is not active. If the data is not yet valid, it implies that the PTW module is waiting for the data to become valid before completing the lookup. In such a case, the code takes appropriate action to wait for the data to become valid before proceeding further. 
- -* **Check for Waiting on Grant:** The second condition the code checks for during a flush scenario is whether the PTW module is currently waiting for a "grant." This "grant" signal is typically used to indicate permission or authorization to proceed with an operation. If the PTW module is indeed in a state of waiting for this grant signal, it implies that it requires authorization before continuing its task. - - * **Waiting for Grant:** If the PTW module is in a state of waiting for the grant signal, the code ensures that it continues to wait for the grant signal to be asserted before proceeding further. - -* **Return to Idle State if Neither Condition is Met:** After evaluating the above two conditions, the code determines whether either of these conditions is true. If neither of these conditions applies, it suggests that the PTW module can return to its idle state, indicating that it can continue normal operations without any dependencies on the flush condition. diff --git a/docs/04_cv32a65x/design/source/parameters_cv32a65x.rst b/docs/04_cv32a65x/design/source/parameters_cv32a65x.rst deleted file mode 100644 index 91c64d617b..0000000000 --- a/docs/04_cv32a65x/design/source/parameters_cv32a65x.rst +++ /dev/null @@ -1,301 +0,0 @@ -.. - Copyright 2024 Thales DIS France SAS - Licensed under the Solderpad Hardware License, Version 2.1 (the "License"); - you may not use this file except in compliance with the License. - SPDX-License-Identifier: Apache-2.0 WITH SHL-2.1 - You may obtain a copy of the License at https://solderpad.org/licenses/ - - Original Author: Jean-Roch COULON - Thales - -.. _cv32a65x_PARAMETERS: - -.. 
list-table:: cv32a65x parameter configuration - :header-rows: 1 - - * - Name - - description - - Value - - * - XLEN - - General Purpose Register Size (in bits) - - 32 - - * - RVA - - Atomic RISC-V extension - - False - - * - RVB - - Bit manipulation RISC-V extension - - True - - * - RVV - - Vector RISC-V extension - - False - - * - RVC - - Compress RISC-V extension - - True - - * - RVH - - Hypervisor RISC-V extension - - False - - * - RVZCB - - Zcb RISC-V extension - - True - - * - RVZCMP - - Zcmp RISC-V extension - - False - - * - RVZiCond - - Zicond RISC-V extension - - False - - * - RVF - - Floating Point - - False - - * - RVD - - Floating Point - - False - - * - XF16 - - Non standard 16bits Floating Point extension - - False - - * - XF16ALT - - Non standard 16bits Floating Point Alt extension - - False - - * - XF8 - - Non standard 8bits Floating Point extension - - False - - * - XFVec - - Non standard Vector Floating Point extension - - False - - * - PerfCounterEn - - Perf counters - - False - - * - MmuPresent - - MMU - - False - - * - RVS - - Supervisor mode - - False - - * - RVU - - User mode - - False - - * - DebugEn - - Debug support - - False - - * - DmBaseAddress - - Base address of the debug module - - 0x0 - - * - HaltAddress - - Address to jump when halt request - - 0x800 - - * - ExceptionAddress - - Address to jump when exception - - 0x808 - - * - TvalEn - - Tval Support Enable - - False - - * - NrPMPEntries - - PMP entries number - - 8 - - * - PMPCfgRstVal - - PMP CSR configuration reset values - - [0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0] - - * - PMPAddrRstVal - - PMP CSR address reset values - - [0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0] - - * - PMPEntryReadOnly - - PMP CSR read-only bits - - 0 - - * - NrNonIdempotentRules - - PMA non idempotent rules number - - 2 - - * - NonIdempotentAddrBase - - PMA NonIdempotent region base address - - [0b0, 0b0] - - * - 
NonIdempotentLength - - PMA NonIdempotent region length - - [0b0, 0b0] - - * - NrExecuteRegionRules - - PMA regions with execute rules number - - 3 - - * - ExecuteRegionAddrBase - - PMA Execute region base address - - [0x80000000, 0x10000, 0x0] - - * - ExecuteRegionLength - - PMA Execute region address base - - [0x40000000, 0x10000, 0x1000] - - * - NrCachedRegionRules - - PMA regions with cache rules number - - 1 - - * - CachedRegionAddrBase - - PMA cache region base address - - [0x80000000] - - * - CachedRegionLength - - PMA cache region rules - - [0x40000000] - - * - CvxifEn - - CV-X-IF coprocessor interface enable - - True - - * - NOCType - - NOC bus type - - config_pkg::NOC_TYPE_AXI4_ATOP - - * - AxiAddrWidth - - AXI address width - - 64 - - * - AxiDataWidth - - AXI data width - - 64 - - * - AxiIdWidth - - AXI ID width - - 4 - - * - AxiUserWidth - - AXI User width - - 32 - - * - AxiBurstWriteEn - - AXI burst in write - - False - - * - MemTidWidth - - TODO - - 4 - - * - IcacheByteSize - - Instruction cache size (in bytes) - - 2048 - - * - IcacheSetAssoc - - Instruction cache associativity (number of ways) - - 2 - - * - IcacheLineWidth - - Instruction cache line width - - 128 - - * - DCacheType - - Cache Type - - config_pkg::HPDCACHE - - * - DcacheIdWidth - - Data cache ID - - 1 - - * - DcacheByteSize - - Data cache size (in bytes) - - 32768 - - * - DcacheSetAssoc - - Data cache associativity (number of ways) - - 8 - - * - DcacheLineWidth - - Data cache line width - - 128 - - * - DataUserEn - - User field on data bus enable - - 0 - - * - WtDcacheWbufDepth - - Write-through data cache write buffer depth - - 2 - - * - FetchUserEn - - User field on fetch bus enable - - 0 - - * - FetchUserWidth - - Width of fetch user field - - 32 - - * - FpgaEn - - Is FPGA optimization of CV32A6 - - False - - * - NrCommitPorts - - Number of commit ports - - 1 - - * - NrLoadPipeRegs - - Load cycle latency number - - 0 - - * - NrStorePipeRegs - - Store cycle latency number - - 0 - - * 
- NrScoreboardEntries - - Scoreboard length - - 4 - - * - NrLoadBufEntries - - Load buffer entry buffer - - 2 - - * - MaxOutstandingStores - - Maximum number of outstanding stores - - 7 - - * - RASDepth - - Return address stack depth - - 2 - - * - BTBEntries - - Branch target buffer entries - - 0 - - * - BHTEntries - - Branch history entries - - 32 - - * - InstrTlbEntries - - MMU instruction TLB entries - - 2 - - * - DataTlbEntries - - MMU data TLB entries - - 2 - - * - UseSharedTlb - - MMU option to use shared TLB - - True - - * - SharedTlbDepth - - MMU depth of shared TLB - - 64 diff --git a/docs/04_cv32a65x/design/source/port_alu.rst b/docs/04_cv32a65x/design/source/port_alu.rst deleted file mode 100644 index 6da09d56b7..0000000000 --- a/docs/04_cv32a65x/design/source/port_alu.rst +++ /dev/null @@ -1,51 +0,0 @@ -.. - Copyright 2024 Thales DIS France SAS - Licensed under the Solderpad Hardware License, Version 2.1 (the "License"); - you may not use this file except in compliance with the License. - SPDX-License-Identifier: Apache-2.0 WITH SHL-2.1 - You may obtain a copy of the License at https://solderpad.org/licenses/ - - Original Author: Jean-Roch COULON - Thales - -.. _CVA6_alu_ports: - -.. 
list-table:: **alu module** IO ports - :header-rows: 1 - - * - Signal - - IO - - Description - - connexion - - Type - - * - ``clk_i`` - - in - - Subsystem Clock - - SUBSYSTEM - - logic - - * - ``rst_ni`` - - in - - Asynchronous reset active low - - SUBSYSTEM - - logic - - * - ``fu_data_i`` - - in - - FU data needed to execute instruction - - ISSUE_STAGE - - fu_data_t - - * - ``result_o`` - - out - - ALU result - - ISSUE_STAGE - - logic[CVA6Cfg.XLEN-1:0] - - * - ``alu_branch_res_o`` - - out - - ALU branch compare result - - branch_unit - - logic - - diff --git a/docs/04_cv32a65x/design/source/port_bht.rst b/docs/04_cv32a65x/design/source/port_bht.rst deleted file mode 100644 index 3661996fc9..0000000000 --- a/docs/04_cv32a65x/design/source/port_bht.rst +++ /dev/null @@ -1,57 +0,0 @@ -.. - Copyright 2024 Thales DIS France SAS - Licensed under the Solderpad Hardware License, Version 2.1 (the "License"); - you may not use this file except in compliance with the License. - SPDX-License-Identifier: Apache-2.0 WITH SHL-2.1 - You may obtain a copy of the License at https://solderpad.org/licenses/ - - Original Author: Jean-Roch COULON - Thales - -.. _CVA6_bht_ports: - -.. list-table:: **bht module** IO ports - :header-rows: 1 - - * - Signal - - IO - - Description - - connexion - - Type - - * - ``clk_i`` - - in - - Subsystem Clock - - SUBSYSTEM - - logic - - * - ``rst_ni`` - - in - - Asynchronous reset active low - - SUBSYSTEM - - logic - - * - ``vpc_i`` - - in - - Virtual PC - - CACHE - - logic[CVA6Cfg.VLEN-1:0] - - * - ``bht_update_i`` - - in - - Update bht with resolved address - - EXECUTE - - bht_update_t - - * - ``bht_prediction_o`` - - out - - Prediction from bht - - FRONTEND - - ariane_pkg::bht_prediction_t[CVA6Cfg.INSTR_PER_FETCH-1:0] - -Due to cv32a65x configuration, some ports are tied to a static value. 
These ports do not appear in the above table, they are listed below - -| For any HW configuration, -| ``flush_bp_i`` input is tied to 0 -| As DebugEn = False, -| ``debug_mode_i`` input is tied to 0 - diff --git a/docs/04_cv32a65x/design/source/port_branch_unit.rst b/docs/04_cv32a65x/design/source/port_branch_unit.rst deleted file mode 100644 index f8ba461865..0000000000 --- a/docs/04_cv32a65x/design/source/port_branch_unit.rst +++ /dev/null @@ -1,105 +0,0 @@ -.. - Copyright 2024 Thales DIS France SAS - Licensed under the Solderpad Hardware License, Version 2.1 (the "License"); - you may not use this file except in compliance with the License. - SPDX-License-Identifier: Apache-2.0 WITH SHL-2.1 - You may obtain a copy of the License at https://solderpad.org/licenses/ - - Original Author: Jean-Roch COULON - Thales - -.. _CVA6_branch_unit_ports: - -.. list-table:: **branch_unit module** IO ports - :header-rows: 1 - - * - Signal - - IO - - Description - - connexion - - Type - - * - ``clk_i`` - - in - - Subsystem Clock - - SUBSYSTEM - - logic - - * - ``rst_ni`` - - in - - Asynchronous reset active low - - SUBSYSTEM - - logic - - * - ``fu_data_i`` - - in - - FU data needed to execute instruction - - ISSUE_STAGE - - fu_data_t - - * - ``pc_i`` - - in - - Instruction PC - - ISSUE_STAGE - - logic[CVA6Cfg.VLEN-1:0] - - * - ``is_compressed_instr_i`` - - in - - Instruction is compressed - - ISSUE_STAGE - - logic - - * - ``fu_valid_i`` - - in - - any functional unit is valid, check that there is no accidental mis-predict - - TO_BE_COMPLETED - - logic - - * - ``branch_valid_i`` - - in - - Branch unit instruction is valid - - ISSUE_STAGE - - logic - - * - ``branch_comp_res_i`` - - in - - ALU branch compare result - - ALU - - logic - - * - ``branch_result_o`` - - out - - Branch unit result - - ISSUE_STAGE - - logic[CVA6Cfg.VLEN-1:0] - - * - ``branch_predict_i`` - - in - - Information of branch prediction - - ISSUE_STAGE - - branchpredict_sbe_t - - * - ``resolved_branch_o`` - - out -
- Signaling that we resolved the branch - - ISSUE_STAGE - - bp_resolve_t - - * - ``resolve_branch_o`` - - out - - Branch is resolved, new entries can be accepted by scoreboard - - ID_STAGE - - logic - - * - ``branch_exception_o`` - - out - - Branch exception out - - TO_BE_COMPLETED - - exception_t - -Due to cv32a65x configuration, some ports are tied to a static value. These ports do not appear in the above table, they are listed below - -| As RVH = False, -| ``v_i`` input is tied to 0 -| As DebugEn = False, -| ``debug_mode_i`` input is tied to 0 - diff --git a/docs/04_cv32a65x/design/source/port_btb.rst b/docs/04_cv32a65x/design/source/port_btb.rst deleted file mode 100644 index bda7f8244b..0000000000 --- a/docs/04_cv32a65x/design/source/port_btb.rst +++ /dev/null @@ -1,57 +0,0 @@ -.. - Copyright 2024 Thales DIS France SAS - Licensed under the Solderpad Hardware License, Version 2.1 (the "License"); - you may not use this file except in compliance with the License. - SPDX-License-Identifier: Apache-2.0 WITH SHL-2.1 - You may obtain a copy of the License at https://solderpad.org/licenses/ - - Original Author: Jean-Roch COULON - Thales - -.. _CVA6_btb_ports: - -.. list-table:: **btb module** IO ports - :header-rows: 1 - - * - Signal - - IO - - Description - - connexion - - Type - - * - ``clk_i`` - - in - - Subsystem Clock - - SUBSYSTEM - - logic - - * - ``rst_ni`` - - in - - Asynchronous reset active low - - SUBSYSTEM - - logic - - * - ``vpc_i`` - - in - - Virtual PC - - CACHE - - logic[CVA6Cfg.VLEN-1:0] - - * - ``btb_update_i`` - - in - - Update BTB with resolved address - - EXECUTE - - btb_update_t - - * - ``btb_prediction_o`` - - out - - BTB Prediction - - FRONTEND - - btb_prediction_t[CVA6Cfg.INSTR_PER_FETCH-1:0] - -Due to cv32a65x configuration, some ports are tied to a static value. 
These ports do not appear in the above table, they are listed below - -| For any HW configuration, -| ``flush_bp_i`` input is tied to 0 -| As DebugEn = False, -| ``debug_mode_i`` input is tied to 0 - diff --git a/docs/04_cv32a65x/design/source/port_commit_stage.rst b/docs/04_cv32a65x/design/source/port_commit_stage.rst deleted file mode 100644 index d710b605ec..0000000000 --- a/docs/04_cv32a65x/design/source/port_commit_stage.rst +++ /dev/null @@ -1,177 +0,0 @@ -.. - Copyright 2024 Thales DIS France SAS - Licensed under the Solderpad Hardware License, Version 2.1 (the "License"); - you may not use this file except in compliance with the License. - SPDX-License-Identifier: Apache-2.0 WITH SHL-2.1 - You may obtain a copy of the License at https://solderpad.org/licenses/ - - Original Author: Jean-Roch COULON - Thales - -.. _CVA6_commit_stage_ports: - -.. list-table:: **commit_stage module** IO ports - :header-rows: 1 - - * - Signal - - IO - - Description - - connexion - - Type - - * - ``clk_i`` - - in - - Subsystem Clock - - SUBSYSTEM - - logic - - * - ``rst_ni`` - - in - - Asynchronous reset active low - - SUBSYSTEM - - logic - - * - ``halt_i`` - - in - - Request to halt the core - - CONTROLLER - - logic - - * - ``flush_dcache_i`` - - in - - request to flush dcache, also flush the pipeline - - CACHE - - logic - - * - ``exception_o`` - - out - - TO_BE_COMPLETED - - EX_STAGE - - exception_t - - * - ``commit_instr_i`` - - in - - The instruction we want to commit - - ISSUE_STAGE - - scoreboard_entry_t[CVA6Cfg.NrCommitPorts-1:0] - - * - ``commit_ack_o`` - - out - - Acknowledge that we are indeed committing - - ISSUE_STAGE - - logic[CVA6Cfg.NrCommitPorts-1:0] - - * - ``commit_macro_ack_o`` - - out - - Acknowledge that we are indeed committing - - CSR_REGFILE - - logic[CVA6Cfg.NrCommitPorts-1:0] - - * - ``waddr_o`` - - out - - Register file write address - - ISSUE_STAGE - - logic[CVA6Cfg.NrCommitPorts-1:0][4:0] - - * - ``wdata_o`` - - out - - Register file write data - - 
ISSUE_STAGE - - logic[CVA6Cfg.NrCommitPorts-1:0][CVA6Cfg.XLEN-1:0] - - * - ``we_gpr_o`` - - out - - Register file write enable - - ISSUE_STAGE - - logic[CVA6Cfg.NrCommitPorts-1:0] - - * - ``we_fpr_o`` - - out - - Floating point register enable - - ISSUE_STAGE - - logic[CVA6Cfg.NrCommitPorts-1:0] - - * - ``pc_o`` - - out - - TO_BE_COMPLETED - - FRONTEND_CSR_REGFILE - - logic[CVA6Cfg.VLEN-1:0] - - * - ``csr_op_o`` - - out - - Decoded CSR operation - - CSR_REGFILE - - fu_op - - * - ``csr_wdata_o`` - - out - - Data to write to CSR - - CSR_REGFILE - - logic[CVA6Cfg.XLEN-1:0] - - * - ``csr_rdata_i`` - - in - - Data to read from CSR - - CSR_REGFILE - - logic[CVA6Cfg.XLEN-1:0] - - * - ``csr_exception_i`` - - in - - Exception or interrupt occurred in CSR stage (the same as commit) - - CSR_REGFILE - - exception_t - - * - ``commit_lsu_o`` - - out - - Commit the pending store - - EX_STAGE - - logic - - * - ``commit_lsu_ready_i`` - - in - - Commit buffer of LSU is ready - - EX_STAGE - - logic - - * - ``commit_tran_id_o`` - - out - - Transaction id of first commit port - - ID_STAGE - - logic[CVA6Cfg.TRANS_ID_BITS-1:0] - - * - ``no_st_pending_i`` - - in - - no store is pending - - EX_STAGE - - logic - - * - ``commit_csr_o`` - - out - - Commit the pending CSR instruction - - EX_STAGE - - logic - - * - ``flush_commit_o`` - - out - - Request a pipeline flush - - CONTROLLER - - logic - -Due to cv32a65x configuration, some ports are tied to a static value. 
These ports do not appear in the above table, they are listed below - -| As RVF = 0, -| ``dirty_fp_state_o`` output is tied to 0 -| ``csr_write_fflags_o`` output is tied to 0 -| As DebugEn = False, -| ``single_step_i`` input is tied to 0 -| As RVA = False, -| ``amo_resp_i`` input is tied to 0 -| ``amo_valid_commit_o`` output is tied to 0 -| As FenceEn = 0, -| ``fence_i_o`` output is tied to 0 -| ``fence_o`` output is tied to 0 -| As RVS = False, -| ``sfence_vma_o`` output is tied to 0 -| As RVH = False, -| ``hfence_vvma_o`` output is tied to 0 -| ``hfence_gvma_o`` output is tied to 0 - diff --git a/docs/04_cv32a65x/design/source/port_compressed_decoder.rst b/docs/04_cv32a65x/design/source/port_compressed_decoder.rst deleted file mode 100644 index 4a5cdb30cf..0000000000 --- a/docs/04_cv32a65x/design/source/port_compressed_decoder.rst +++ /dev/null @@ -1,51 +0,0 @@ -.. - Copyright 2024 Thales DIS France SAS - Licensed under the Solderpad Hardware License, Version 2.1 (the "License"); - you may not use this file except in compliance with the License. - SPDX-License-Identifier: Apache-2.0 WITH SHL-2.1 - You may obtain a copy of the License at https://solderpad.org/licenses/ - - Original Author: Jean-Roch COULON - Thales - -.. _CVA6_compressed_decoder_ports: - -.. 
list-table:: **compressed_decoder module** IO ports - :header-rows: 1 - - * - Signal - - IO - - Description - - connexion - - Type - - * - ``instr_i`` - - in - - Input instruction coming from fetch stage - - FRONTEND - - logic[31:0] - - * - ``instr_o`` - - out - - Output instruction in uncompressed format - - decoder - - logic[31:0] - - * - ``illegal_instr_o`` - - out - - Input instruction is illegal - - decoder - - logic - - * - ``is_macro_instr_o`` - - out - - Output instruction is macro - - decoder - - logic - - * - ``is_compressed_o`` - - out - - Output instruction is compressed - - decoder - - logic - - diff --git a/docs/04_cv32a65x/design/source/port_controller.rst b/docs/04_cv32a65x/design/source/port_controller.rst deleted file mode 100644 index 7569ab898d..0000000000 --- a/docs/04_cv32a65x/design/source/port_controller.rst +++ /dev/null @@ -1,149 +0,0 @@ -.. - Copyright 2024 Thales DIS France SAS - Licensed under the Solderpad Hardware License, Version 2.1 (the "License"); - you may not use this file except in compliance with the License. - SPDX-License-Identifier: Apache-2.0 WITH SHL-2.1 - You may obtain a copy of the License at https://solderpad.org/licenses/ - - Original Author: Jean-Roch COULON - Thales - -.. _CVA6_controller_ports: - -.. 
list-table:: **controller module** IO ports - :header-rows: 1 - - * - Signal - - IO - - Description - - connexion - - Type - - * - ``clk_i`` - - in - - Subsystem Clock - - SUBSYSTEM - - logic - - * - ``rst_ni`` - - in - - Asynchronous reset active low - - SUBSYSTEM - - logic - - * - ``set_pc_commit_o`` - - out - - Set PC om PC Gen - - FRONTEND - - logic - - * - ``flush_if_o`` - - out - - Flush the IF stage - - FRONTEND - - logic - - * - ``flush_unissued_instr_o`` - - out - - Flush un-issued instructions of the scoreboard - - FRONTEND - - logic - - * - ``flush_id_o`` - - out - - Flush ID stage - - ID_STAGE - - logic - - * - ``flush_ex_o`` - - out - - Flush EX stage - - EX_STAGE - - logic - - * - ``flush_bp_o`` - - out - - Flush branch predictors - - FRONTEND - - logic - - * - ``flush_icache_o`` - - out - - Flush ICache - - CACHE - - logic - - * - ``flush_dcache_o`` - - out - - Flush DCache - - CACHE - - logic - - * - ``flush_dcache_ack_i`` - - in - - Acknowledge the whole DCache Flush - - CACHE - - logic - - * - ``halt_csr_i`` - - in - - Halt request from CSR (WFI instruction) - - CSR_REGFILE - - logic - - * - ``halt_o`` - - out - - Halt signal to commit stage - - COMMIT_STAGE - - logic - - * - ``eret_i`` - - in - - Return from exception - - CSR_REGFILE - - logic - - * - ``ex_valid_i`` - - in - - We got an exception, flush the pipeline - - FRONTEND - - logic - - * - ``resolved_branch_i`` - - in - - We got a resolved branch, check if we need to flush the front-end - - EX_STAGE - - bp_resolve_t - - * - ``flush_csr_i`` - - in - - We got an instruction which altered the CSR, flush the pipeline - - CSR_REGFILE - - logic - - * - ``flush_commit_i`` - - in - - Flush request from commit stage - - COMMIT_STAGE - - logic - -Due to cv32a65x configuration, some ports are tied to a static value. 
These ports do not appear in the above table, they are listed below - -| As RVH = False, -| ``v_i`` input is tied to 0 -| ``flush_tlb_vvma_o`` output is tied to 0 -| ``flush_tlb_gvma_o`` output is tied to 0 -| ``hfence_vvma_i`` input is tied to 0 -| ``hfence_gvma_i`` input is tied to 0 -| As MMUPresent = 0, -| ``flush_tlb_o`` output is tied to 0 -| As EnableAccelerator = 0, -| ``halt_acc_i`` input is tied to 0 -| ``flush_acc_i`` input is tied to 0 -| As DebugEn = False, -| ``set_debug_pc_i`` input is tied to 0 -| As FenceEn = 0, -| ``fence_i_i`` input is tied to 0 -| ``fence_i`` input is tied to 0 -| As RVS = False, -| ``sfence_vma_i`` input is tied to 0 - diff --git a/docs/04_cv32a65x/design/source/port_csr_buffer.rst b/docs/04_cv32a65x/design/source/port_csr_buffer.rst deleted file mode 100644 index e83ad5e34a..0000000000 --- a/docs/04_cv32a65x/design/source/port_csr_buffer.rst +++ /dev/null @@ -1,75 +0,0 @@ -.. - Copyright 2024 Thales DIS France SAS - Licensed under the Solderpad Hardware License, Version 2.1 (the "License"); - you may not use this file except in compliance with the License. - SPDX-License-Identifier: Apache-2.0 WITH SHL-2.1 - You may obtain a copy of the License at https://solderpad.org/licenses/ - - Original Author: Jean-Roch COULON - Thales - -.. _CVA6_csr_buffer_ports: - -.. 
list-table:: **csr_buffer module** IO ports - :header-rows: 1 - - * - Signal - - IO - - Description - - connexion - - Type - - * - ``clk_i`` - - in - - Subsystem Clock - - SUBSYSTEM - - logic - - * - ``rst_ni`` - - in - - Asynchronous reset active low - - SUBSYSTEM - - logic - - * - ``flush_i`` - - in - - Flush CSR - - CONTROLLER - - logic - - * - ``fu_data_i`` - - in - - FU data needed to execute instruction - - ISSUE_STAGE - - fu_data_t - - * - ``csr_ready_o`` - - out - - CSR FU is ready - - ISSUE_STAGE - - logic - - * - ``csr_valid_i`` - - in - - CSR instruction is valid - - ISSUE_STAGE - - logic - - * - ``csr_result_o`` - - out - - CSR buffer result - - ISSUE_STAGE - - logic[CVA6Cfg.XLEN-1:0] - - * - ``csr_commit_i`` - - in - - commit the pending CSR OP - - TO_BE_COMPLETED - - logic - - * - ``csr_addr_o`` - - out - - CSR address to write - - COMMIT_STAGE - - logic[11:0] - - diff --git a/docs/04_cv32a65x/design/source/port_csr_regfile.rst b/docs/04_cv32a65x/design/source/port_csr_regfile.rst deleted file mode 100644 index 7eb94e58fd..0000000000 --- a/docs/04_cv32a65x/design/source/port_csr_regfile.rst +++ /dev/null @@ -1,228 +0,0 @@ -.. - Copyright 2024 Thales DIS France SAS - Licensed under the Solderpad Hardware License, Version 2.1 (the "License"); - you may not use this file except in compliance with the License. - SPDX-License-Identifier: Apache-2.0 WITH SHL-2.1 - You may obtain a copy of the License at https://solderpad.org/licenses/ - - Original Author: Jean-Roch COULON - Thales - -.. _CVA6_csr_regfile_ports: - -.. 
list-table:: **csr_regfile module** IO ports - :header-rows: 1 - - * - Signal - - IO - - Description - - connexion - - Type - - * - ``clk_i`` - - in - - Subsystem Clock - - SUBSYSTEM - - logic - - * - ``rst_ni`` - - in - - Asynchronous reset active low - - SUBSYSTEM - - logic - - * - ``time_irq_i`` - - in - - Timer threw a interrupt - - SUBSYSTEM - - logic - - * - ``flush_o`` - - out - - send a flush request out when a CSR with a side effect changes - - CONTROLLER - - logic - - * - ``halt_csr_o`` - - out - - halt requested - - CONTROLLER - - logic - - * - ``commit_instr_i`` - - in - - Instruction to be committed - - ID_STAGE - - scoreboard_entry_t[CVA6Cfg.NrCommitPorts-1:0] - - * - ``commit_ack_i`` - - in - - Commit acknowledged a instruction -> increase instret CSR - - COMMIT_STAGE - - logic[CVA6Cfg.NrCommitPorts-1:0] - - * - ``boot_addr_i`` - - in - - Address from which to start booting, mtvec is set to the same address - - SUBSYSTEM - - logic[CVA6Cfg.VLEN-1:0] - - * - ``hart_id_i`` - - in - - Hart id in a multicore environment (reflected in a CSR) - - SUBSYSTEM - - logic[CVA6Cfg.XLEN-1:0] - - * - ``ex_i`` - - in - - We've got an exception from the commit stage, take it - - COMMIT_STAGE - - exception_t - - * - ``csr_op_i`` - - in - - Operation to perform on the CSR file - - COMMIT_STAGE - - fu_op - - * - ``csr_addr_i`` - - in - - Address of the register to read/write - - EX_STAGE - - logic[11:0] - - * - ``csr_wdata_i`` - - in - - Write data in - - COMMIT_STAGE - - logic[CVA6Cfg.XLEN-1:0] - - * - ``csr_rdata_o`` - - out - - Read data out - - COMMIT_STAGE - - logic[CVA6Cfg.XLEN-1:0] - - * - ``pc_i`` - - in - - PC of instruction accessing the CSR - - COMMIT_STAGE - - logic[CVA6Cfg.VLEN-1:0] - - * - ``csr_exception_o`` - - out - - attempts to access a CSR without appropriate privilege - - COMMIT_STAGE - - exception_t - - * - ``epc_o`` - - out - - Output the exception PC to PC Gen, the correct CSR (mepc, sepc) is set accordingly - - FRONTEND - - 
logic[CVA6Cfg.VLEN-1:0] - - * - ``eret_o`` - - out - - Return from exception, set the PC of epc_o - - FRONTEND - - logic - - * - ``trap_vector_base_o`` - - out - - Output base of exception vector, correct CSR is output (mtvec, stvec) - - FRONTEND - - logic[CVA6Cfg.VLEN-1:0] - - * - ``irq_ctrl_o`` - - out - - interrupt management to id stage - - ID_STAGE - - irq_ctrl_t - - * - ``irq_i`` - - in - - external interrupt in - - SUBSYSTEM - - logic[1:0] - - * - ``ipi_i`` - - in - - inter processor interrupt -> connected to machine mode sw - - SUBSYSTEM - - logic - - * - ``icache_en_o`` - - out - - L1 ICache Enable - - CACHE - - logic - - * - ``dcache_en_o`` - - out - - L1 DCache Enable - - CACHE - - logic - - * - ``rvfi_csr_o`` - - out - - none - - none - - rvfi_probes_csr_t - -Due to cv32a65x configuration, some ports are tied to a static value. These ports do not appear in the above table, they are listed below - -| As RVF = 0, -| ``dirty_fp_state_i`` input is tied to 0 -| ``csr_write_fflags_i`` input is tied to 0 -| ``fs_o`` output is tied to 0 -| ``fflags_o`` output is tied to 0 -| ``frm_o`` output is tied to 0 -| ``fprec_o`` output is tied to 0 -| As EnableAccelerator = 0, -| ``dirty_v_state_i`` input is tied to 0 -| ``acc_fflags_ex_i`` input is tied to 0 -| ``acc_fflags_ex_valid_i`` input is tied to 0 -| ``acc_cons_en_o`` output is tied to 0 -| ``pmpcfg_o`` output is tied to 0 -| ``pmpaddr_o`` output is tied to 0 -| As PRIV = MachineOnly, -| ``priv_lvl_o`` output is tied to MachineMode -| ``ld_st_priv_lvl_o`` output is tied to MAchineMode -| ``tvm_o`` output is tied to 0 -| ``tw_o`` output is tied to 0 -| ``tsr_o`` output is tied to 0 -| As RVH = False, -| ``v_o`` output is tied to 0 -| ``vfs_o`` output is tied to 0 -| ``en_g_translation_o`` output is tied to 0 -| ``en_ld_st_g_translation_o`` output is tied to 0 -| ``ld_st_v_o`` output is tied to 0 -| ``csr_hs_ld_st_inst_i`` input is tied to 0 -| ``vs_sum_o`` output is tied to 0 -| ``vmxr_o`` output is tied to 0 -| 
``vsatp_ppn_o`` output is tied to 0 -| ``vs_asid_o`` output is tied to 0 -| ``hgatp_ppn_o`` output is tied to 0 -| ``vmid_o`` output is tied to 0 -| ``vtw_o`` output is tied to 0 -| ``hu_o`` output is tied to 0 -| As RVV = False, -| ``vs_o`` output is tied to 0 -| As RVS = False, -| ``en_translation_o`` output is tied to 0 -| ``en_ld_st_translation_o`` output is tied to 0 -| ``sum_o`` output is tied to 0 -| ``mxr_o`` output is tied to 0 -| ``satp_ppn_o`` output is tied to 0 -| ``asid_o`` output is tied to 0 -| As DebugEn = False, -| ``debug_req_i`` input is tied to 0 -| ``set_debug_pc_o`` output is tied to 0 -| ``debug_mode_o`` output is tied to 0 -| ``single_step_o`` output is tied to 0 -| As PerfCounterEn = 0, -| ``perf_addr_o`` output is tied to 0 -| ``perf_data_o`` output is tied to 0 -| ``perf_data_i`` input is tied to 0 -| ``perf_we_o`` output is tied to 0 -| ``mcountinhibit_o`` output is tied to 0 - diff --git a/docs/04_cv32a65x/design/source/port_cva6.rst b/docs/04_cv32a65x/design/source/port_cva6.rst deleted file mode 100644 index 5d9fa282aa..0000000000 --- a/docs/04_cv32a65x/design/source/port_cva6.rst +++ /dev/null @@ -1,93 +0,0 @@ -.. - Copyright 2024 Thales DIS France SAS - Licensed under the Solderpad Hardware License, Version 2.1 (the "License"); - you may not use this file except in compliance with the License. - SPDX-License-Identifier: Apache-2.0 WITH SHL-2.1 - You may obtain a copy of the License at https://solderpad.org/licenses/ - - Original Author: Jean-Roch COULON - Thales - -.. _CVA6_cva6_ports: - -.. 
list-table:: **cva6 module** IO ports - :header-rows: 1 - - * - Signal - - IO - - Description - - connexion - - Type - - * - ``clk_i`` - - in - - Subsystem Clock - - SUBSYSTEM - - logic - - * - ``rst_ni`` - - in - - Asynchronous reset active low - - SUBSYSTEM - - logic - - * - ``boot_addr_i`` - - in - - Reset boot address - - SUBSYSTEM - - logic[CVA6Cfg.VLEN-1:0] - - * - ``hart_id_i`` - - in - - Hard ID reflected as CSR - - SUBSYSTEM - - logic[CVA6Cfg.XLEN-1:0] - - * - ``irq_i`` - - in - - Level sensitive (async) interrupts - - SUBSYSTEM - - logic[1:0] - - * - ``ipi_i`` - - in - - Inter-processor (async) interrupt - - SUBSYSTEM - - logic - - * - ``time_irq_i`` - - in - - Timer (async) interrupt - - SUBSYSTEM - - logic - - * - ``cvxif_req_o`` - - out - - CVXIF request - - SUBSYSTEM - - cvxif_req_t - - * - ``cvxif_resp_i`` - - in - - CVXIF response - - SUBSYSTEM - - cvxif_resp_t - - * - ``noc_req_o`` - - out - - noc request, can be AXI or OpenPiton - - SUBSYSTEM - - noc_req_t - - * - ``noc_resp_i`` - - in - - noc response, can be AXI or OpenPiton - - SUBSYSTEM - - noc_resp_t - -Due to cv32a65x configuration, some ports are tied to a static value. These ports do not appear in the above table, they are listed below - -| As DebugEn = False, -| ``debug_req_i`` input is tied to 0 -| As IsRVFI = 0, -| ``rvfi_probes_o`` output is tied to 0 - diff --git a/docs/04_cv32a65x/design/source/port_cva6_hpdcache_subsystem.rst b/docs/04_cv32a65x/design/source/port_cva6_hpdcache_subsystem.rst deleted file mode 100644 index 225dc34f64..0000000000 --- a/docs/04_cv32a65x/design/source/port_cva6_hpdcache_subsystem.rst +++ /dev/null @@ -1,153 +0,0 @@ -.. - Copyright 2024 Thales DIS France SAS - Licensed under the Solderpad Hardware License, Version 2.1 (the "License"); - you may not use this file except in compliance with the License. 
- SPDX-License-Identifier: Apache-2.0 WITH SHL-2.1 - You may obtain a copy of the License at https://solderpad.org/licenses/ - - Original Author: Jean-Roch COULON - Thales - -.. _CVA6_cva6_hpdcache_subsystem_ports: - -.. list-table:: **cva6_hpdcache_subsystem module** IO ports - :header-rows: 1 - - * - Signal - - IO - - Description - - connexion - - Type - - * - ``clk_i`` - - in - - Subsystem Clock - - SUBSYSTEM - - logic - - * - ``rst_ni`` - - in - - Asynchronous reset active low - - SUBSYSTEM - - logic - - * - ``noc_req_o`` - - out - - noc request, can be AXI or OpenPiton - - SUBSYSTEM - - noc_req_t - - * - ``noc_resp_i`` - - in - - noc response, can be AXI or OpenPiton - - SUBSYSTEM - - noc_resp_t - - * - ``icache_en_i`` - - in - - Instruction cache enable - - CSR_REGFILE - - logic - - * - ``icache_flush_i`` - - in - - Flush the instruction cache - - CONTROLLER - - logic - - * - ``icache_areq_i`` - - in - - Input address translation request - - EX_STAGE - - icache_areq_t - - * - ``icache_areq_o`` - - out - - Output address translation request - - EX_STAGE - - icache_arsp_t - - * - ``icache_dreq_i`` - - in - - Input data translation request - - FRONTEND - - icache_dreq_t - - * - ``icache_dreq_o`` - - out - - Output data translation request - - FRONTEND - - icache_drsp_t - - * - ``dcache_enable_i`` - - in - - Data cache enable - - CSR_REGFILE - - logic - - * - ``dcache_flush_i`` - - in - - Data cache flush - - CONTROLLER - - logic - - * - ``dcache_flush_ack_o`` - - out - - Flush acknowledge - - CONTROLLER - - logic - - * - ``dcache_amo_req_i`` - - in - - AMO request - - EX_STAGE - - ariane_pkg::amo_req_t - - * - ``dcache_amo_resp_o`` - - out - - AMO response - - EX_STAGE - - ariane_pkg::amo_resp_t - - * - ``dcache_req_ports_i`` - - in - - Data cache input request ports - - EX_STAGE - - dcache_req_i_t[NumPorts-1:0] - - * - ``dcache_req_ports_o`` - - out - - Data cache output request ports - - EX_STAGE - - dcache_req_o_t[NumPorts-1:0] - - * - ``wbuffer_empty_o`` - - 
out - - Write buffer status to know if empty - - EX_STAGE - - logic - - * - ``wbuffer_not_ni_o`` - - out - - Write buffer status to know if not non idempotent - - EX_STAGE - - logic - -Due to cv32a65x configuration, some ports are tied to a static value. These ports do not appear in the above table, they are listed below - -| As PerfCounterEn = 0, -| ``icache_miss_o`` output is tied to 0 -| ``dcache_miss_o`` output is tied to 0 -| For any HW configuration, -| ``dcache_cmo_req_i`` input is tied to 0 -| ``dcache_cmo_resp_o`` output is tied to open -| ``hwpf_base_set_i`` input is tied to 0 -| ``hwpf_base_i`` input is tied to 0 -| ``hwpf_base_o`` output is tied to 0 -| ``hwpf_param_set_i`` input is tied to 0 -| ``hwpf_param_i`` input is tied to 0 -| ``hwpf_param_o`` output is tied to 0 -| ``hwpf_throttle_set_i`` input is tied to 0 -| ``hwpf_throttle_i`` input is tied to 0 -| ``hwpf_throttle_o`` output is tied to 0 -| ``hwpf_status_o`` output is tied to 0 - diff --git a/docs/04_cv32a65x/design/source/port_cvxif_fu.rst b/docs/04_cv32a65x/design/source/port_cvxif_fu.rst deleted file mode 100644 index 7e481a15a4..0000000000 --- a/docs/04_cv32a65x/design/source/port_cvxif_fu.rst +++ /dev/null @@ -1,103 +0,0 @@ -.. - Copyright 2024 Thales DIS France SAS - Licensed under the Solderpad Hardware License, Version 2.1 (the "License"); - you may not use this file except in compliance with the License. - SPDX-License-Identifier: Apache-2.0 WITH SHL-2.1 - You may obtain a copy of the License at https://solderpad.org/licenses/ - - Original Author: Jean-Roch COULON - Thales - -.. _CVA6_cvxif_fu_ports: - -.. 
list-table:: **cvxif_fu module** IO ports - :header-rows: 1 - - * - Signal - - IO - - Description - - connexion - - Type - - * - ``clk_i`` - - in - - Subsystem Clock - - SUBSYSTEM - - logic - - * - ``rst_ni`` - - in - - Asynchronous reset active low - - SUBSYSTEM - - logic - - * - ``fu_data_i`` - - in - - FU data needed to execute instruction - - ISSUE_STAGE - - fu_data_t - - * - ``x_valid_i`` - - in - - CVXIF instruction is valid - - ISSUE_STAGE - - logic - - * - ``x_ready_o`` - - out - - CVXIF is ready - - ISSUE_STAGE - - logic - - * - ``x_off_instr_i`` - - in - - Offloaded instruction - - ISSUE_STAGE - - logic[31:0] - - * - ``x_trans_id_o`` - - out - - CVXIF transaction ID - - ISSUE_STAGE - - logic[CVA6Cfg.TRANS_ID_BITS-1:0] - - * - ``x_exception_o`` - - out - - CVXIF exception - - ISSUE_STAGE - - exception_t - - * - ``x_result_o`` - - out - - CVXIF FU result - - ISSUE_STAGE - - logic[CVA6Cfg.XLEN-1:0] - - * - ``x_valid_o`` - - out - - CVXIF result valid - - ISSUE_STAGE - - logic - - * - ``x_we_o`` - - out - - CVXIF write enable - - ISSUE_STAGE - - logic - - * - ``cvxif_req_o`` - - out - - CVXIF request - - SUBSYSTEM - - cvxif_pkg::cvxif_req_t - - * - ``cvxif_resp_i`` - - in - - CVXIF response - - SUBSYSTEM - - cvxif_pkg::cvxif_resp_t - -Due to cv32a65x configuration, some ports are tied to a static value. These ports do not appear in the above table, they are listed below - -| As PRIV = MachineOnly, -| ``priv_lvl_i`` input is tied to MachineMode - diff --git a/docs/04_cv32a65x/design/source/port_decoder.rst b/docs/04_cv32a65x/design/source/port_decoder.rst deleted file mode 100644 index 424dd7861d..0000000000 --- a/docs/04_cv32a65x/design/source/port_decoder.rst +++ /dev/null @@ -1,131 +0,0 @@ -.. - Copyright 2024 Thales DIS France SAS - Licensed under the Solderpad Hardware License, Version 2.1 (the "License"); - you may not use this file except in compliance with the License. 
- SPDX-License-Identifier: Apache-2.0 WITH SHL-2.1 - You may obtain a copy of the License at https://solderpad.org/licenses/ - - Original Author: Jean-Roch COULON - Thales - -.. _CVA6_decoder_ports: - -.. list-table:: **decoder module** IO ports - :header-rows: 1 - - * - Signal - - IO - - Description - - connexion - - Type - - * - ``pc_i`` - - in - - PC from fetch stage - - FRONTEND - - logic[CVA6Cfg.VLEN-1:0] - - * - ``is_compressed_i`` - - in - - Is a compressed instruction - - compressed_decoder - - logic - - * - ``compressed_instr_i`` - - in - - Compressed form of instruction - - FRONTEND - - logic[15:0] - - * - ``is_illegal_i`` - - in - - Illegal compressed instruction - - compressed_decoder - - logic - - * - ``instruction_i`` - - in - - Instruction from fetch stage - - FRONTEND - - logic[31:0] - - * - ``is_macro_instr_i`` - - in - - Is a macro instruction - - macro_decoder - - logic - - * - ``is_last_macro_instr_i`` - - in - - Is a last macro instruction - - macro_decoder - - logic - - * - ``is_double_rd_macro_instr_i`` - - in - - Is mvsa01/mva01s macro instruction - - macro_decoder - - logic - - * - ``branch_predict_i`` - - in - - Is a branch predict instruction - - FRONTEND - - branchpredict_sbe_t - - * - ``ex_i`` - - in - - If an exception occured in fetch stage - - FRONTEND - - exception_t - - * - ``irq_i`` - - in - - Level sensitive (async) interrupts - - SUBSYSTEM - - logic[1:0] - - * - ``irq_ctrl_i`` - - in - - Interrupt control status - - CSR_REGFILE - - irq_ctrl_t - - * - ``instruction_o`` - - out - - Instruction to be added to scoreboard entry - - ISSUE_STAGE - - scoreboard_entry_t - - * - ``orig_instr_o`` - - out - - Instruction - - ISSUE_STAGE - - logic[31:0] - - * - ``is_control_flow_instr_o`` - - out - - Is a control flow instruction - - ISSUE_STAGE - - logic - -Due to cv32a65x configuration, some ports are tied to a static value. 
These ports do not appear in the above table, they are listed below - -| As DebugEn = False, -| ``debug_req_i`` input is tied to 0 -| ``debug_mode_i`` input is tied to 0 -| As PRIV = MachineOnly, -| ``priv_lvl_i`` input is tied to MachineMode -| ``tvm_i`` input is tied to 0 -| ``tw_i`` input is tied to 0 -| ``tsr_i`` input is tied to 0 -| As RVH = False, -| ``v_i`` input is tied to 0 -| ``vfs_i`` input is tied to 0 -| ``vtw_i`` input is tied to 0 -| ``hu_i`` input is tied to 0 -| As RVF = 0, -| ``fs_i`` input is tied to 0 -| ``frm_i`` input is tied to 0 -| As RVV = False, -| ``vs_i`` input is tied to 0 - diff --git a/docs/04_cv32a65x/design/source/port_ex_stage.rst b/docs/04_cv32a65x/design/source/port_ex_stage.rst deleted file mode 100644 index 91d59791a9..0000000000 --- a/docs/04_cv32a65x/design/source/port_ex_stage.rst +++ /dev/null @@ -1,400 +0,0 @@ -.. - Copyright 2024 Thales DIS France SAS - Licensed under the Solderpad Hardware License, Version 2.1 (the "License"); - you may not use this file except in compliance with the License. - SPDX-License-Identifier: Apache-2.0 WITH SHL-2.1 - You may obtain a copy of the License at https://solderpad.org/licenses/ - - Original Author: Jean-Roch COULON - Thales - -.. _CVA6_ex_stage_ports: - -.. 
list-table:: **ex_stage module** IO ports - :header-rows: 1 - - * - Signal - - IO - - Description - - connexion - - Type - - * - ``clk_i`` - - in - - Subsystem Clock - - SUBSYSTEM - - logic - - * - ``rst_ni`` - - in - - Asynchronous reset active low - - SUBSYSTEM - - logic - - * - ``flush_i`` - - in - - Fetch flush request - - CONTROLLER - - logic - - * - ``rs1_forwarding_i`` - - in - - rs1 forwarding - - ISSUE_STAGE - - logic[CVA6Cfg.VLEN-1:0] - - * - ``rs2_forwarding_i`` - - in - - rs2 forwarding - - ISSUE_STAGE - - logic[CVA6Cfg.VLEN-1:0] - - * - ``fu_data_i`` - - in - - FU data useful to execute instruction - - ISSUE_STAGE - - fu_data_t - - * - ``pc_i`` - - in - - PC of the current instruction - - ISSUE_STAGE - - logic[CVA6Cfg.VLEN-1:0] - - * - ``is_compressed_instr_i`` - - in - - Report whether isntruction is compressed - - ISSUE_STAGE - - logic - - * - ``flu_result_o`` - - out - - Fixed Latency Unit result - - ISSUE_STAGE - - logic[CVA6Cfg.XLEN-1:0] - - * - ``flu_trans_id_o`` - - out - - ID of the scoreboard entry at which a=to write back - - ISSUE_STAGE - - logic[CVA6Cfg.TRANS_ID_BITS-1:0] - - * - ``flu_exception_o`` - - out - - Fixed Latency Unit exception - - ISSUE_STAGE - - exception_t - - * - ``flu_ready_o`` - - out - - FLU is ready - - ISSUE_STAGE - - logic - - * - ``flu_valid_o`` - - out - - FLU result is valid - - ISSUE_STAGE - - logic - - * - ``alu_valid_i`` - - in - - ALU instruction is valid - - ISSUE_STAGE - - logic - - * - ``branch_valid_i`` - - in - - Branch unit instruction is valid - - ISSUE_STAGE - - logic - - * - ``branch_predict_i`` - - in - - Information of branch prediction - - ISSUE_STAGE - - branchpredict_sbe_t - - * - ``resolved_branch_o`` - - out - - The branch engine uses the write back from the ALU - - several_modules - - bp_resolve_t - - * - ``resolve_branch_o`` - - out - - Signaling that we resolved the branch - - ISSUE_STAGE - - logic - - * - ``csr_valid_i`` - - in - - CSR instruction is valid - - ISSUE_STAGE - - logic - - * - 
``csr_addr_o`` - - out - - CSR address to write - - COMMIT_STAGE - - logic[11:0] - - * - ``csr_commit_i`` - - in - - CSR commit - - COMMIT_STAGE - - logic - - * - ``mult_valid_i`` - - in - - MULT instruction is valid - - ISSUE_STAGE - - logic - - * - ``lsu_ready_o`` - - out - - LSU is ready - - ISSUE_STAGE - - logic - - * - ``lsu_valid_i`` - - in - - LSU instruction is valid - - ISSUE_STAGE - - logic - - * - ``load_valid_o`` - - out - - Load result is valid - - ISSUE_STAGE - - logic - - * - ``load_result_o`` - - out - - Load result valid - - ISSUE_STAGE - - logic[CVA6Cfg.XLEN-1:0] - - * - ``load_trans_id_o`` - - out - - Load instruction ID - - ISSUE_STAGE - - logic[CVA6Cfg.TRANS_ID_BITS-1:0] - - * - ``load_exception_o`` - - out - - Exception generated by load instruction - - ISSUE_STAGE - - exception_t - - * - ``store_valid_o`` - - out - - Store result is valid - - ISSUe_STAGE - - logic - - * - ``store_result_o`` - - out - - Store result - - ISSUE_STAGE - - logic[CVA6Cfg.XLEN-1:0] - - * - ``store_trans_id_o`` - - out - - Store instruction ID - - ISSUE_STAGE - - logic[CVA6Cfg.TRANS_ID_BITS-1:0] - - * - ``store_exception_o`` - - out - - Exception generated by store instruction - - ISSUE_STAGE - - exception_t - - * - ``lsu_commit_i`` - - in - - LSU commit - - COMMIT_STAGE - - logic - - * - ``lsu_commit_ready_o`` - - out - - Commit queue ready to accept another commit request - - COMMIT_STAGE - - logic - - * - ``commit_tran_id_i`` - - in - - Commit transaction ID - - COMMIT_STAGE - - logic[CVA6Cfg.TRANS_ID_BITS-1:0] - - * - ``no_st_pending_o`` - - out - - TO_BE_COMPLETED - - COMMIT_STAGE - - logic - - * - ``x_valid_i`` - - in - - CVXIF instruction is valid - - ISSUE_STAGE - - logic - - * - ``x_ready_o`` - - out - - CVXIF is ready - - ISSUE_STAGE - - logic - - * - ``x_off_instr_i`` - - in - - undecoded instruction - - ISSUE_STAGE - - logic[31:0] - - * - ``x_trans_id_o`` - - out - - CVXIF transaction ID - - ISSUE_STAGE - - logic[CVA6Cfg.TRANS_ID_BITS-1:0] - - * - 
``x_exception_o`` - - out - - CVXIF exception - - ISSUE_STAGE - - exception_t - - * - ``x_result_o`` - - out - - CVXIF result - - ISSUE_STAGE - - logic[CVA6Cfg.XLEN-1:0] - - * - ``x_valid_o`` - - out - - CVXIF result valid - - ISSUE_STAGE - - logic - - * - ``x_we_o`` - - out - - CVXIF write enable - - ISSUE_STAGE - - logic - - * - ``cvxif_req_o`` - - out - - CVXIF request - - SUBSYSTEM - - cvxif_pkg::cvxif_req_t - - * - ``cvxif_resp_i`` - - in - - CVXIF response - - SUBSYSTEM - - cvxif_pkg::cvxif_resp_t - - * - ``icache_areq_i`` - - in - - icache translation response - - CACHE - - icache_arsp_t - - * - ``icache_areq_o`` - - out - - icache translation request - - CACHE - - icache_areq_t - - * - ``dcache_req_ports_i`` - - in - - Data cache request ouput - - CACHE - - dcache_req_o_t[2:0] - - * - ``dcache_req_ports_o`` - - out - - Data cache request input - - CACHE - - dcache_req_i_t[2:0] - - * - ``dcache_wbuffer_empty_i`` - - in - - Write buffer is empty - - CACHE - - logic - - * - ``dcache_wbuffer_not_ni_i`` - - in - - TO_BE_COMPLETED - - CACHE - - logic - - * - ``pmpcfg_i`` - - in - - Report the PMP configuration - - CSR_REGFILE - - riscv::pmpcfg_t[15:0] - - * - ``pmpaddr_i`` - - in - - Report the PMP addresses - - CSR_REGFILE - - logic[15:0][CVA6Cfg.PLEN-3:0] - -Due to cv32a65x configuration, some ports are tied to a static value. 
These ports do not appear in the above table, they are listed below - -| As DebugEn = False, -| ``debug_mode_i`` input is tied to 0 -| As RVH = False, -| ``tinst_i`` input is tied to 0 -| ``enable_g_translation_i`` input is tied to 0 -| ``en_ld_st_g_translation_i`` input is tied to 0 -| ``flush_tlb_vvma_i`` input is tied to 0 -| ``flush_tlb_gvma_i`` input is tied to 0 -| ``v_i`` input is tied to 0 -| ``ld_st_v_i`` input is tied to 0 -| ``csr_hs_ld_st_inst_o`` output is tied to 0 -| ``vs_sum_i`` input is tied to 0 -| ``vmxr_i`` input is tied to 0 -| ``vsatp_ppn_i`` input is tied to 0 -| ``vs_asid_i`` input is tied to 0 -| ``hgatp_ppn_i`` input is tied to 0 -| ``vmid_i`` input is tied to 0 -| As EnableAccelerator = 0, -| ``stall_st_pending_i`` input is tied to 0 -| ``acc_valid_i`` input is tied to 0 -| As RVA = False, -| ``amo_valid_commit_i`` input is tied to 0 -| ``amo_req_o`` output is tied to 0 -| ``amo_resp_i`` input is tied to 0 -| As RVF = 0, -| ``fpu_ready_o`` output is tied to 0 -| ``fpu_valid_i`` input is tied to 0 -| ``fpu_fmt_i`` input is tied to 0 -| ``fpu_rm_i`` input is tied to 0 -| ``fpu_frm_i`` input is tied to 0 -| ``fpu_prec_i`` input is tied to 0 -| ``fpu_trans_id_o`` output is tied to 0 -| ``fpu_result_o`` output is tied to 0 -| ``fpu_valid_o`` output is tied to 0 -| ``fpu_exception_o`` output is tied to 0 -| As RVS = False, -| ``enable_translation_i`` input is tied to 0 -| ``en_ld_st_translation_i`` input is tied to 0 -| ``sum_i`` input is tied to 0 -| ``mxr_i`` input is tied to 0 -| ``satp_ppn_i`` input is tied to 0 -| ``asid_i`` input is tied to 0 -| As MMUPresent = 0, -| ``flush_tlb_i`` input is tied to 0 -| As PRIV = MachineOnly, -| ``priv_lvl_i`` input is tied to MachineMode -| ``ld_st_priv_lvl_i`` input is tied to MAchineMode -| As PerfCounterEn = 0, -| ``itlb_miss_o`` output is tied to 0 -| ``dtlb_miss_o`` output is tied to 0 -| As IsRVFI = 0, -| ``rvfi_lsu_ctrl_o`` output is tied to 0 -| ``rvfi_mem_paddr_o`` output is tied to 0 - diff 
--git a/docs/04_cv32a65x/design/source/port_frontend.rst b/docs/04_cv32a65x/design/source/port_frontend.rst deleted file mode 100644 index f92b71fdb9..0000000000 --- a/docs/04_cv32a65x/design/source/port_frontend.rst +++ /dev/null @@ -1,130 +0,0 @@ -.. - Copyright 2024 Thales DIS France SAS - Licensed under the Solderpad Hardware License, Version 2.1 (the "License"); - you may not use this file except in compliance with the License. - SPDX-License-Identifier: Apache-2.0 WITH SHL-2.1 - You may obtain a copy of the License at https://solderpad.org/licenses/ - - Original Author: Jean-Roch COULON - Thales - -.. _CVA6_frontend_ports: - -.. list-table:: **frontend module** IO ports - :header-rows: 1 - - * - Signal - - IO - - Description - - connexion - - Type - - * - ``clk_i`` - - in - - Subsystem Clock - - SUBSYSTEM - - logic - - * - ``rst_ni`` - - in - - Asynchronous reset active low - - SUBSYSTEM - - logic - - * - ``boot_addr_i`` - - in - - Next PC when reset - - SUBSYSTEM - - logic[CVA6Cfg.VLEN-1:0] - - * - ``flush_i`` - - in - - Flush requested by FENCE, mis-predict and exception - - CONTROLLER - - logic - - * - ``halt_i`` - - in - - Halt requested by WFI and Accelerate port - - CONTROLLER - - logic - - * - ``set_pc_commit_i`` - - in - - Set COMMIT PC as next PC requested by FENCE, CSR side-effect and Accelerate port - - CONTROLLER - - logic - - * - ``pc_commit_i`` - - in - - COMMIT PC - - COMMIT - - logic[CVA6Cfg.VLEN-1:0] - - * - ``ex_valid_i`` - - in - - Exception event - - COMMIT - - logic - - * - ``resolved_branch_i`` - - in - - Mispredict event and next PC - - EXECUTE - - bp_resolve_t - - * - ``eret_i`` - - in - - Return from exception event - - CSR - - logic - - * - ``epc_i`` - - in - - Next PC when returning from exception - - CSR - - logic[CVA6Cfg.VLEN-1:0] - - * - ``trap_vector_base_i`` - - in - - Next PC when jumping into exception - - CSR - - logic[CVA6Cfg.VLEN-1:0] - - * - ``icache_dreq_o`` - - out - - Handshake between CACHE and FRONTEND (fetch) - - 
CACHES - - icache_dreq_t - - * - ``icache_dreq_i`` - - in - - Handshake between CACHE and FRONTEND (fetch) - - CACHES - - icache_drsp_t - - * - ``fetch_entry_o`` - - out - - Handshake's data between fetch and decode - - ID_STAGE - - fetch_entry_t[ariane_pkg::SUPERSCALAR:0] - - * - ``fetch_entry_valid_o`` - - out - - Handshake's valid between fetch and decode - - ID_STAGE - - logic[ariane_pkg::SUPERSCALAR:0] - - * - ``fetch_entry_ready_i`` - - in - - Handshake's ready between fetch and decode - - ID_STAGE - - logic[ariane_pkg::SUPERSCALAR:0] - -Due to cv32a65x configuration, some ports are tied to a static value. These ports do not appear in the above table, they are listed below - -| For any HW configuration, -| ``flush_bp_i`` input is tied to 0 -| As DebugEn = False, -| ``set_debug_pc_i`` input is tied to 0 -| ``debug_mode_i`` input is tied to 0 - diff --git a/docs/04_cv32a65x/design/source/port_id_stage.rst b/docs/04_cv32a65x/design/source/port_id_stage.rst deleted file mode 100644 index 687850512a..0000000000 --- a/docs/04_cv32a65x/design/source/port_id_stage.rst +++ /dev/null @@ -1,121 +0,0 @@ -.. - Copyright 2024 Thales DIS France SAS - Licensed under the Solderpad Hardware License, Version 2.1 (the "License"); - you may not use this file except in compliance with the License. - SPDX-License-Identifier: Apache-2.0 WITH SHL-2.1 - You may obtain a copy of the License at https://solderpad.org/licenses/ - - Original Author: Jean-Roch COULON - Thales - -.. _CVA6_id_stage_ports: - -.. 
list-table:: **id_stage module** IO ports - :header-rows: 1 - - * - Signal - - IO - - Description - - connexion - - Type - - * - ``clk_i`` - - in - - Subsystem Clock - - SUBSYSTEM - - logic - - * - ``rst_ni`` - - in - - Asynchronous reset active low - - SUBSYSTEM - - logic - - * - ``flush_i`` - - in - - Fetch flush request - - CONTROLLER - - logic - - * - ``fetch_entry_i`` - - in - - Handshake's data between fetch and decode - - FRONTEND - - fetch_entry_t[ariane_pkg::SUPERSCALAR:0] - - * - ``fetch_entry_valid_i`` - - in - - Handshake's valid between fetch and decode - - FRONTEND - - logic[ariane_pkg::SUPERSCALAR:0] - - * - ``fetch_entry_ready_o`` - - out - - Handshake's ready between fetch and decode - - FRONTEND - - logic[ariane_pkg::SUPERSCALAR:0] - - * - ``issue_entry_o`` - - out - - Handshake's data between decode and issue - - ISSUE - - scoreboard_entry_t[ariane_pkg::SUPERSCALAR:0] - - * - ``orig_instr_o`` - - out - - Instruction value - - ISSUE - - logic[ariane_pkg::SUPERSCALAR:0][31:0] - - * - ``issue_entry_valid_o`` - - out - - Handshake's valid between decode and issue - - ISSUE - - logic[ariane_pkg::SUPERSCALAR:0] - - * - ``is_ctrl_flow_o`` - - out - - Report if instruction is a control flow instruction - - ISSUE - - logic[ariane_pkg::SUPERSCALAR:0] - - * - ``issue_instr_ack_i`` - - in - - Handshake's acknowlege between decode and issue - - ISSUE - - logic[ariane_pkg::SUPERSCALAR:0] - - * - ``irq_i`` - - in - - Level sensitive (async) interrupts - - SUBSYSTEM - - logic[1:0] - - * - ``irq_ctrl_i`` - - in - - Interrupt control status - - CSR_REGFILE - - irq_ctrl_t - -Due to cv32a65x configuration, some ports are tied to a static value. 
These ports do not appear in the above table, they are listed below - -| As DebugEn = False, -| ``debug_req_i`` input is tied to 0 -| ``debug_mode_i`` input is tied to 0 -| As IsRVFI = 0, -| ``rvfi_is_compressed_o`` output is tied to 0 -| As PRIV = MachineOnly, -| ``priv_lvl_i`` input is tied to MachineMode -| ``tvm_i`` input is tied to 0 -| ``tw_i`` input is tied to 0 -| ``tsr_i`` input is tied to 0 -| As RVH = False, -| ``v_i`` input is tied to 0 -| ``vfs_i`` input is tied to 0 -| ``vtw_i`` input is tied to 0 -| ``hu_i`` input is tied to 0 -| As RVF = 0, -| ``fs_i`` input is tied to 0 -| ``frm_i`` input is tied to 0 -| As RVV = False, -| ``vs_i`` input is tied to 0 - diff --git a/docs/04_cv32a65x/design/source/port_instr_queue.rst b/docs/04_cv32a65x/design/source/port_instr_queue.rst deleted file mode 100644 index b5a73da1a3..0000000000 --- a/docs/04_cv32a65x/design/source/port_instr_queue.rst +++ /dev/null @@ -1,129 +0,0 @@ -.. - Copyright 2024 Thales DIS France SAS - Licensed under the Solderpad Hardware License, Version 2.1 (the "License"); - you may not use this file except in compliance with the License. - SPDX-License-Identifier: Apache-2.0 WITH SHL-2.1 - You may obtain a copy of the License at https://solderpad.org/licenses/ - - Original Author: Jean-Roch COULON - Thales - -.. _CVA6_instr_queue_ports: - -.. 
list-table:: **instr_queue module** IO ports - :header-rows: 1 - - * - Signal - - IO - - Description - - connexion - - Type - - * - ``clk_i`` - - in - - Subsystem Clock - - SUBSYSTEM - - logic - - * - ``rst_ni`` - - in - - Asynchronous reset active low - - SUBSYSTEM - - logic - - * - ``flush_i`` - - in - - Fetch flush request - - CONTROLLER - - logic - - * - ``instr_i`` - - in - - Instruction - - instr_realign - - logic[CVA6Cfg.INSTR_PER_FETCH-1:0][31:0] - - * - ``addr_i`` - - in - - Instruction address - - instr_realign - - logic[CVA6Cfg.INSTR_PER_FETCH-1:0][CVA6Cfg.VLEN-1:0] - - * - ``valid_i`` - - in - - Instruction is valid - - instr_realign - - logic[CVA6Cfg.INSTR_PER_FETCH-1:0] - - * - ``ready_o`` - - out - - Handshake’s ready with CACHE - - CACHE - - logic - - * - ``consumed_o`` - - out - - Indicates instructions consummed, or popped by ID_STAGE - - FRONTEND - - logic[CVA6Cfg.INSTR_PER_FETCH-1:0] - - * - ``exception_i`` - - in - - Exception (which is page-table fault) - - CACHE - - ariane_pkg::frontend_exception_t - - * - ``exception_addr_i`` - - in - - Exception address - - CACHE - - logic[CVA6Cfg.VLEN-1:0] - - * - ``predict_address_i`` - - in - - Branch predict - - FRONTEND - - logic[CVA6Cfg.VLEN-1:0] - - * - ``cf_type_i`` - - in - - Instruction predict address - - FRONTEND - - ariane_pkg::cf_t[CVA6Cfg.INSTR_PER_FETCH-1:0] - - * - ``replay_o`` - - out - - Replay instruction because one of the FIFO was full - - FRONTEND - - logic - - * - ``replay_addr_o`` - - out - - Address at which to replay the fetch - - FRONTEND - - logic[CVA6Cfg.VLEN-1:0] - - * - ``fetch_entry_o`` - - out - - Handshake’s data with ID_STAGE - - ID_STAGE - - fetch_entry_t[ariane_pkg::SUPERSCALAR:0] - - * - ``fetch_entry_valid_o`` - - out - - Handshake’s valid with ID_STAGE - - ID_STAGE - - logic[ariane_pkg::SUPERSCALAR:0] - - * - ``fetch_entry_ready_i`` - - in - - Handshake’s ready with ID_STAGE - - ID_STAGE - - logic[ariane_pkg::SUPERSCALAR:0] - -Due to cv32a65x configuration, some 
ports are tied to a static value. These ports do not appear in the above table, they are listed below - -| As RVH = False, -| ``exception_gpaddr_i`` input is tied to 0 -| ``exception_tinst_i`` input is tied to 0 -| ``exception_gva_i`` input is tied to 0 - diff --git a/docs/04_cv32a65x/design/source/port_instr_realign.rst b/docs/04_cv32a65x/design/source/port_instr_realign.rst deleted file mode 100644 index fef98d99e6..0000000000 --- a/docs/04_cv32a65x/design/source/port_instr_realign.rst +++ /dev/null @@ -1,81 +0,0 @@ -.. - Copyright 2024 Thales DIS France SAS - Licensed under the Solderpad Hardware License, Version 2.1 (the "License"); - you may not use this file except in compliance with the License. - SPDX-License-Identifier: Apache-2.0 WITH SHL-2.1 - You may obtain a copy of the License at https://solderpad.org/licenses/ - - Original Author: Jean-Roch COULON - Thales - -.. _CVA6_instr_realign_ports: - -.. list-table:: **instr_realign module** IO ports - :header-rows: 1 - - * - Signal - - IO - - Description - - connexion - - Type - - * - ``clk_i`` - - in - - Subsystem Clock - - SUBSYSTEM - - logic - - * - ``rst_ni`` - - in - - Asynchronous reset active low - - SUBSYSTEM - - logic - - * - ``flush_i`` - - in - - Fetch flush request - - CONTROLLER - - logic - - * - ``valid_i`` - - in - - 32-bit block is valid - - CACHE - - logic - - * - ``serving_unaligned_o`` - - out - - Instruction is unaligned - - FRONTEND - - logic - - * - ``address_i`` - - in - - 32-bit block address - - CACHE - - logic[CVA6Cfg.VLEN-1:0] - - * - ``data_i`` - - in - - 32-bit block - - CACHE - - logic[CVA6Cfg.FETCH_WIDTH-1:0] - - * - ``valid_o`` - - out - - instruction is valid - - FRONTEND - - logic[CVA6Cfg.INSTR_PER_FETCH-1:0] - - * - ``addr_o`` - - out - - Instruction address - - FRONTEND - - logic[CVA6Cfg.INSTR_PER_FETCH-1:0][CVA6Cfg.VLEN-1:0] - - * - ``instr_o`` - - out - - Instruction - - instr_scan&instr_queue - - logic[CVA6Cfg.INSTR_PER_FETCH-1:0][31:0] - - diff --git 
a/docs/04_cv32a65x/design/source/port_instr_scan.rst b/docs/04_cv32a65x/design/source/port_instr_scan.rst deleted file mode 100644 index dbc877777e..0000000000 --- a/docs/04_cv32a65x/design/source/port_instr_scan.rst +++ /dev/null @@ -1,105 +0,0 @@ -.. - Copyright 2024 Thales DIS France SAS - Licensed under the Solderpad Hardware License, Version 2.1 (the "License"); - you may not use this file except in compliance with the License. - SPDX-License-Identifier: Apache-2.0 WITH SHL-2.1 - You may obtain a copy of the License at https://solderpad.org/licenses/ - - Original Author: Jean-Roch COULON - Thales - -.. _CVA6_instr_scan_ports: - -.. list-table:: **instr_scan module** IO ports - :header-rows: 1 - - * - Signal - - IO - - Description - - connexion - - Type - - * - ``instr_i`` - - in - - Instruction to be predecoded - - instr_realign - - logic[31:0] - - * - ``rvi_return_o`` - - out - - Return instruction - - FRONTEND - - logic - - * - ``rvi_call_o`` - - out - - JAL instruction - - FRONTEND - - logic - - * - ``rvi_branch_o`` - - out - - Branch instruction - - FRONTEND - - logic - - * - ``rvi_jalr_o`` - - out - - JALR instruction - - FRONTEND - - logic - - * - ``rvi_jump_o`` - - out - - Unconditional jump instruction - - FRONTEND - - logic - - * - ``rvi_imm_o`` - - out - - Instruction immediat - - FRONTEND - - logic[CVA6Cfg.VLEN-1:0] - - * - ``rvc_branch_o`` - - out - - Branch compressed instruction - - FRONTEND - - logic - - * - ``rvc_jump_o`` - - out - - Unconditional jump compressed instruction - - FRONTEND - - logic - - * - ``rvc_jr_o`` - - out - - JR compressed instruction - - FRONTEND - - logic - - * - ``rvc_return_o`` - - out - - Return compressed instruction - - FRONTEND - - logic - - * - ``rvc_jalr_o`` - - out - - JALR compressed instruction - - FRONTEND - - logic - - * - ``rvc_call_o`` - - out - - JAL compressed instruction - - FRONTEND - - logic - - * - ``rvc_imm_o`` - - out - - Instruction compressed immediat - - FRONTEND - - logic[CVA6Cfg.VLEN-1:0] - - 
diff --git a/docs/04_cv32a65x/design/source/port_issue_read_operands.rst b/docs/04_cv32a65x/design/source/port_issue_read_operands.rst deleted file mode 100644 index d8f742ff93..0000000000 --- a/docs/04_cv32a65x/design/source/port_issue_read_operands.rst +++ /dev/null @@ -1,261 +0,0 @@ -.. - Copyright 2024 Thales DIS France SAS - Licensed under the Solderpad Hardware License, Version 2.1 (the "License"); - you may not use this file except in compliance with the License. - SPDX-License-Identifier: Apache-2.0 WITH SHL-2.1 - You may obtain a copy of the License at https://solderpad.org/licenses/ - - Original Author: Jean-Roch COULON - Thales - -.. _CVA6_issue_read_operands_ports: - -.. list-table:: **issue_read_operands module** IO ports - :header-rows: 1 - - * - Signal - - IO - - Description - - connexion - - Type - - * - ``clk_i`` - - in - - Subsystem Clock - - SUBSYSTEM - - logic - - * - ``rst_ni`` - - in - - Asynchronous reset active low - - SUBSYSTEM - - logic - - * - ``flush_i`` - - in - - Flush - - CONTROLLER - - logic - - * - ``issue_instr_i`` - - in - - TO_BE_COMPLETED - - TO_BE_COMPLETED - - scoreboard_entry_t - - * - ``orig_instr_i`` - - in - - TO_BE_COMPLETED - - TO_BE_COMPLETED - - logic[31:0] - - * - ``issue_instr_valid_i`` - - in - - TO_BE_COMPLETED - - TO_BE_COMPLETED - - logic - - * - ``issue_ack_o`` - - out - - Issue stage acknowledge - - TO_BE_COMPLETED - - logic - - * - ``rs1_o`` - - out - - rs1 operand address - - scoreboard - - logic[REG_ADDR_SIZE-1:0] - - * - ``rs1_i`` - - in - - rs1 operand - - scoreboard - - logic[CVA6Cfg.XLEN-1:0] - - * - ``rs1_valid_i`` - - in - - rs1 operand is valid - - scoreboard - - logic - - * - ``rs2_o`` - - out - - rs2 operand address - - scoreboard - - logic[REG_ADDR_SIZE-1:0] - - * - ``rs2_i`` - - in - - rs2 operand - - scoreboard - - logic[CVA6Cfg.XLEN-1:0] - - * - ``rs2_valid_i`` - - in - - rs2 operand is valid - - scoreboard - - logic - - * - ``rs3_o`` - - out - - rs3 operand address - - scoreboard - - 
logic[REG_ADDR_SIZE-1:0] - - * - ``rs3_i`` - - in - - rs3 operand - - scoreboard - - rs3_len_t - - * - ``rs3_valid_i`` - - in - - rs3 operand is valid - - scoreboard - - logic - - * - ``rd_clobber_gpr_i`` - - in - - TO_BE_COMPLETED - - TO_BE_COMPLETED - - fu_t[2**REG_ADDR_SIZE-1:0] - - * - ``rd_clobber_fpr_i`` - - in - - TO_BE_COMPLETED - - TO_BE_COMPLETED - - fu_t[2**REG_ADDR_SIZE-1:0] - - * - ``fu_data_o`` - - out - - TO_BE_COMPLETED - - TO_BE_COMPLETED - - fu_data_t - - * - ``rs1_forwarding_o`` - - out - - Unregistered version of fu_data_o.operanda - - TO_BE_COMPLETED - - logic[CVA6Cfg.XLEN-1:0] - - * - ``rs2_forwarding_o`` - - out - - Unregistered version of fu_data_o.operandb - - TO_BE_COMPLETED - - logic[CVA6Cfg.XLEN-1:0] - - * - ``pc_o`` - - out - - Instruction pc - - TO_BE_COMPLETED - - logic[CVA6Cfg.VLEN-1:0] - - * - ``is_compressed_instr_o`` - - out - - Is compressed instruction - - TO_BE_COMPLETED - - logic - - * - ``flu_ready_i`` - - in - - Fixed Latency Unit ready to accept new request - - TO_BE_COMPLETED - - logic - - * - ``alu_valid_o`` - - out - - ALU output is valid - - TO_BE_COMPLETED - - logic - - * - ``branch_valid_o`` - - out - - Branch instruction is valid - - TO_BE_COMPLETED - - logic - - * - ``branch_predict_o`` - - out - - TO_BE_COMPLETED - - TO_BE_COMPLETED - - branchpredict_sbe_t - - * - ``lsu_ready_i`` - - in - - Load Store Unit is ready - - TO_BE_COMPLETED - - logic - - * - ``lsu_valid_o`` - - out - - Load Store Unit result is valid - - TO_BE_COMPLETED - - logic - - * - ``mult_valid_o`` - - out - - Mult result is valid - - TO_BE_COMPLETED - - logic - - * - ``csr_valid_o`` - - out - - CSR result is valid - - TO_BE_COMPLETED - - logic - - * - ``cvxif_valid_o`` - - out - - CVXIF result is valid - - TO_BE_COMPLETED - - logic - - * - ``cvxif_ready_i`` - - in - - CVXIF is ready - - TO_BE_COMPLETED - - logic - - * - ``cvxif_off_instr_o`` - - out - - CVXIF offloaded instruction - - TO_BE_COMPLETED - - logic[31:0] - - * - ``waddr_i`` - - in - - 
TO_BE_COMPLETED - - TO_BE_COMPLETED - - logic[CVA6Cfg.NrCommitPorts-1:0][4:0] - - * - ``wdata_i`` - - in - - TO_BE_COMPLETED - - TO_BE_COMPLETED - - logic[CVA6Cfg.NrCommitPorts-1:0][CVA6Cfg.XLEN-1:0] - - * - ``we_gpr_i`` - - in - - TO_BE_COMPLETED - - TO_BE_COMPLETED - - logic[CVA6Cfg.NrCommitPorts-1:0] - - * - ``stall_issue_o`` - - out - - Stall signal, we do not want to fetch any more entries - - TO_BE_COMPLETED - - logic - -Due to cv32a65x configuration, some ports are tied to a static value. These ports do not appear in the above table, they are listed below - -| As EnableAccelerator = 0, -| ``stall_i`` input is tied to 0 -| As RVH = False, -| ``tinst_o`` output is tied to 0 -| As RVF = 0, -| ``fpu_ready_i`` input is tied to 0 -| ``fpu_valid_o`` output is tied to 0 -| ``fpu_fmt_o`` output is tied to 0 -| ``fpu_rm_o`` output is tied to 0 -| ``we_fpr_i`` input is tied to 0 - diff --git a/docs/04_cv32a65x/design/source/port_issue_stage.rst b/docs/04_cv32a65x/design/source/port_issue_stage.rst deleted file mode 100644 index 0fa740fd3c..0000000000 --- a/docs/04_cv32a65x/design/source/port_issue_stage.rst +++ /dev/null @@ -1,263 +0,0 @@ -.. - Copyright 2024 Thales DIS France SAS - Licensed under the Solderpad Hardware License, Version 2.1 (the "License"); - you may not use this file except in compliance with the License. - SPDX-License-Identifier: Apache-2.0 WITH SHL-2.1 - You may obtain a copy of the License at https://solderpad.org/licenses/ - - Original Author: Jean-Roch COULON - Thales - -.. _CVA6_issue_stage_ports: - -.. 
list-table:: **issue_stage module** IO ports - :header-rows: 1 - - * - Signal - - IO - - Description - - connexion - - Type - - * - ``clk_i`` - - in - - Subsystem Clock - - SUBSYSTEM - - logic - - * - ``rst_ni`` - - in - - Asynchronous reset active low - - SUBSYSTEM - - logic - - * - ``flush_unissued_instr_i`` - - in - - TO_BE_COMPLETED - - CONTROLLER - - logic - - * - ``flush_i`` - - in - - TO_BE_COMPLETED - - CONTROLLER - - logic - - * - ``decoded_instr_i`` - - in - - Handshake's data with decode stage - - ID_STAGE - - scoreboard_entry_t[SUPERSCALAR:0] - - * - ``orig_instr_i`` - - in - - instruction value - - ID_STAGE - - logic[SUPERSCALAR:0][31:0] - - * - ``decoded_instr_valid_i`` - - in - - Handshake's valid with decode stage - - ID_STAGE - - logic[SUPERSCALAR:0] - - * - ``is_ctrl_flow_i`` - - in - - Is instruction a control flow instruction - - ID_STAGE - - logic[SUPERSCALAR:0] - - * - ``decoded_instr_ack_o`` - - out - - Handshake's acknowlege with decode stage - - ID_STAGE - - logic[SUPERSCALAR:0] - - * - ``rs1_forwarding_o`` - - out - - rs1 forwarding - - EX_STAGE - - [CVA6Cfg.VLEN-1:0] - - * - ``rs2_forwarding_o`` - - out - - rs2 forwarding - - EX_STAGE - - [CVA6Cfg.VLEN-1:0] - - * - ``fu_data_o`` - - out - - FU data useful to execute instruction - - EX_STAGE - - fu_data_t - - * - ``pc_o`` - - out - - Program Counter - - EX_STAGE - - logic[CVA6Cfg.VLEN-1:0] - - * - ``is_compressed_instr_o`` - - out - - Is compressed instruction - - EX_STAGE - - logic - - * - ``flu_ready_i`` - - in - - Fixed Latency Unit is ready - - EX_STAGE - - logic - - * - ``alu_valid_o`` - - out - - ALU FU is valid - - EX_STAGE - - logic - - * - ``resolve_branch_i`` - - in - - Signaling that we resolved the branch - - EX_STAGE - - logic - - * - ``lsu_ready_i`` - - in - - Load store unit FU is ready - - EX_STAGE - - logic - - * - ``lsu_valid_o`` - - out - - Load store unit FU is valid - - EX_STAGE - - logic - - * - ``branch_valid_o`` - - out - - Branch unit is valid - - EX_STAGE - - 
logic - - * - ``branch_predict_o`` - - out - - Information of branch prediction - - EX_STAGE - - branchpredict_sbe_t - - * - ``mult_valid_o`` - - out - - Mult FU is valid - - EX_STAGE - - logic - - * - ``csr_valid_o`` - - out - - CSR is valid - - EX_STAGE - - logic - - * - ``x_issue_valid_o`` - - out - - CVXIF FU is valid - - EX_STAGE - - logic - - * - ``x_issue_ready_i`` - - in - - CVXIF is FU ready - - EX_STAGE - - logic - - * - ``x_off_instr_o`` - - out - - CVXIF offloader instruction value - - EX_STAGE - - logic[31:0] - - * - ``trans_id_i`` - - in - - Transaction ID - - EX_STAGE - - logic[CVA6Cfg.NrWbPorts-1:0][CVA6Cfg.TRANS_ID_BITS-1:0] - - * - ``resolved_branch_i`` - - in - - The branch engine uses the write back from the ALU - - EX_STAGE - - bp_resolve_t - - * - ``wbdata_i`` - - in - - TO_BE_COMPLETED - - EX_STAGE - - logic[CVA6Cfg.NrWbPorts-1:0][CVA6Cfg.XLEN-1:0] - - * - ``ex_ex_i`` - - in - - exception from execute stage or CVXIF - - EX_STAGE - - exception_t[CVA6Cfg.NrWbPorts-1:0] - - * - ``wt_valid_i`` - - in - - TO_BE_COMPLETED - - EX_STAGE - - logic[CVA6Cfg.NrWbPorts-1:0] - - * - ``x_we_i`` - - in - - CVXIF write enable - - EX_STAGE - - logic - - * - ``waddr_i`` - - in - - TO_BE_COMPLETED - - EX_STAGE - - logic[CVA6Cfg.NrCommitPorts-1:0][4:0] - - * - ``wdata_i`` - - in - - TO_BE_COMPLETED - - EX_STAGE - - logic[CVA6Cfg.NrCommitPorts-1:0][CVA6Cfg.XLEN-1:0] - - * - ``we_gpr_i`` - - in - - GPR write enable - - EX_STAGE - - logic[CVA6Cfg.NrCommitPorts-1:0] - - * - ``commit_instr_o`` - - out - - Instructions to commit - - COMMIT_STAGE - - scoreboard_entry_t[CVA6Cfg.NrCommitPorts-1:0] - - * - ``commit_ack_i`` - - in - - Commit acknowledge - - COMMIT_STAGE - - logic[CVA6Cfg.NrCommitPorts-1:0] - -Due to cv32a65x configuration, some ports are tied to a static value. 
These ports do not appear in the above table, they are listed below - -| As PerfCounterEn = 0, -| ``sb_full_o`` output is tied to 0 -| ``stall_issue_o`` output is tied to 0 -| As EnableAccelerator = 0, -| ``stall_i`` input is tied to 0 -| ``issue_instr_o`` output is tied to 0 -| ``issue_instr_hs_o`` output is tied to 0 -| As RVH = False, -| ``tinst_o`` output is tied to 0 -| As RVF = 0, -| ``fpu_ready_i`` input is tied to 0 -| ``fpu_valid_o`` output is tied to 0 -| ``fpu_fmt_o`` output is tied to 0 -| ``fpu_rm_o`` output is tied to 0 -| ``we_fpr_i`` input is tied to 0 -| As IsRVFI = 0, -| ``rvfi_issue_pointer_o`` output is tied to 0 -| ``rvfi_commit_pointer_o`` output is tied to 0 - diff --git a/docs/04_cv32a65x/design/source/port_load_store_unit.rst b/docs/04_cv32a65x/design/source/port_load_store_unit.rst deleted file mode 100644 index ea49497f4c..0000000000 --- a/docs/04_cv32a65x/design/source/port_load_store_unit.rst +++ /dev/null @@ -1,226 +0,0 @@ -.. - Copyright 2024 Thales DIS France SAS - Licensed under the Solderpad Hardware License, Version 2.1 (the "License"); - you may not use this file except in compliance with the License. - SPDX-License-Identifier: Apache-2.0 WITH SHL-2.1 - You may obtain a copy of the License at https://solderpad.org/licenses/ - - Original Author: Jean-Roch COULON - Thales - -.. _CVA6_load_store_unit_ports: - -.. 
list-table:: **load_store_unit module** IO ports - :header-rows: 1 - - * - Signal - - IO - - Description - - connexion - - Type - - * - ``clk_i`` - - in - - Subsystem Clock - - SUBSYSTEM - - logic - - * - ``rst_ni`` - - in - - Asynchronous reset active low - - SUBSYSTEM - - logic - - * - ``flush_i`` - - in - - TO_BE_COMPLETED - - TO_BE_COMPLETED - - logic - - * - ``stall_st_pending_i`` - - in - - TO_BE_COMPLETED - - TO_BE_COMPLETED - - logic - - * - ``no_st_pending_o`` - - out - - TO_BE_COMPLETED - - TO_BE_COMPLETED - - logic - - * - ``fu_data_i`` - - in - - FU data needed to execute instruction - - ISSUE_STAGE - - fu_data_t - - * - ``lsu_ready_o`` - - out - - Load Store Unit is ready - - ISSUE_STAGE - - logic - - * - ``lsu_valid_i`` - - in - - Load Store Unit instruction is valid - - ISSUE_STAGE - - logic - - * - ``load_trans_id_o`` - - out - - Load transaction ID - - ISSUE_STAGE - - logic[CVA6Cfg.TRANS_ID_BITS-1:0] - - * - ``load_result_o`` - - out - - Load result - - ISSUE_STAGE - - logic[CVA6Cfg.XLEN-1:0] - - * - ``load_valid_o`` - - out - - Load result is valid - - ISSUE_STAGE - - logic - - * - ``load_exception_o`` - - out - - Load exception - - ISSUE_STAGE - - exception_t - - * - ``store_trans_id_o`` - - out - - Store transaction ID - - ISSUE_STAGE - - logic[CVA6Cfg.TRANS_ID_BITS-1:0] - - * - ``store_result_o`` - - out - - Store result - - ISSUE_STAGE - - logic[CVA6Cfg.XLEN-1:0] - - * - ``store_valid_o`` - - out - - Store result is valid - - ISSUE_STAGE - - logic - - * - ``store_exception_o`` - - out - - Store exception - - ISSUE_STAGE - - exception_t - - * - ``commit_i`` - - in - - Commit the first pending store - - TO_BE_COMPLETED - - logic - - * - ``commit_ready_o`` - - out - - Commit queue is ready to accept another commit request - - TO_BE_COMPLETED - - logic - - * - ``commit_tran_id_i`` - - in - - Commit transaction ID - - TO_BE_COMPLETED - - logic[CVA6Cfg.TRANS_ID_BITS-1:0] - - * - ``icache_areq_i`` - - in - - Instruction cache input request - - CACHES 
- - icache_arsp_t - - * - ``icache_areq_o`` - - out - - Instruction cache output request - - CACHES - - icache_areq_t - - * - ``dcache_req_ports_i`` - - in - - Data cache request output - - CACHES - - dcache_req_o_t[2:0] - - * - ``dcache_req_ports_o`` - - out - - Data cache request input - - CACHES - - dcache_req_i_t[2:0] - - * - ``dcache_wbuffer_empty_i`` - - in - - TO_BE_COMPLETED - - TO_BE_COMPLETED - - logic - - * - ``dcache_wbuffer_not_ni_i`` - - in - - TO_BE_COMPLETED - - TO_BE_COMPLETED - - logic - - * - ``pmpcfg_i`` - - in - - PMP configuration - - CSR_REGFILE - - riscv::pmpcfg_t[15:0] - - * - ``pmpaddr_i`` - - in - - PMP address - - CSR_REGFILE - - logic[15:0][CVA6Cfg.PLEN-3:0] - -Due to cv32a65x configuration, some ports are tied to a static value. These ports do not appear in the above table, they are listed below - -| As RVA = False, -| ``amo_valid_commit_i`` input is tied to 0 -| ``amo_req_o`` output is tied to 0 -| ``amo_resp_i`` input is tied to 0 -| As RVH = False, -| ``tinst_i`` input is tied to 0 -| ``enable_g_translation_i`` input is tied to 0 -| ``en_ld_st_g_translation_i`` input is tied to 0 -| ``v_i`` input is tied to 0 -| ``ld_st_v_i`` input is tied to 0 -| ``csr_hs_ld_st_inst_o`` output is tied to 0 -| ``vs_sum_i`` input is tied to 0 -| ``vmxr_i`` input is tied to 0 -| ``vsatp_ppn_i`` input is tied to 0 -| ``vs_asid_i`` input is tied to 0 -| ``hgatp_ppn_i`` input is tied to 0 -| ``vmid_i`` input is tied to 0 -| ``vmid_to_be_flushed_i`` input is tied to 0 -| ``gpaddr_to_be_flushed_i`` input is tied to 0 -| ``flush_tlb_vvma_i`` input is tied to 0 -| ``flush_tlb_gvma_i`` input is tied to 0 -| As RVS = False, -| ``enable_translation_i`` input is tied to 0 -| ``en_ld_st_translation_i`` input is tied to 0 -| ``sum_i`` input is tied to 0 -| ``mxr_i`` input is tied to 0 -| ``satp_ppn_i`` input is tied to 0 -| ``asid_i`` input is tied to 0 -| ``asid_to_be_flushed_i`` input is tied to 0 -| ``vaddr_to_be_flushed_i`` input is tied to 0 -| As PRIV = 
MachineOnly, -| ``priv_lvl_i`` input is tied to MachineMode -| ``ld_st_priv_lvl_i`` input is tied to MAchineMode -| As MMUPresent = 0, -| ``flush_tlb_i`` input is tied to 0 -| As PerfCounterEn = 0, -| ``itlb_miss_o`` output is tied to 0 -| ``dtlb_miss_o`` output is tied to 0 -| As IsRVFI = 0, -| ``rvfi_lsu_ctrl_o`` output is tied to 0 -| ``rvfi_mem_paddr_o`` output is tied to 0 - diff --git a/docs/04_cv32a65x/design/source/port_load_unit.rst b/docs/04_cv32a65x/design/source/port_load_unit.rst deleted file mode 100644 index 9461f65e5d..0000000000 --- a/docs/04_cv32a65x/design/source/port_load_unit.rst +++ /dev/null @@ -1,157 +0,0 @@ -.. - Copyright 2024 Thales DIS France SAS - Licensed under the Solderpad Hardware License, Version 2.1 (the "License"); - you may not use this file except in compliance with the License. - SPDX-License-Identifier: Apache-2.0 WITH SHL-2.1 - You may obtain a copy of the License at https://solderpad.org/licenses/ - - Original Author: Jean-Roch COULON - Thales - -.. _CVA6_load_unit_ports: - -.. 
list-table:: **load_unit module** IO ports - :header-rows: 1 - - * - Signal - - IO - - Description - - connexion - - Type - - * - ``clk_i`` - - in - - Subsystem Clock - - SUBSYSTEM - - logic - - * - ``rst_ni`` - - in - - Asynchronous reset active low - - SUBSYSTEM - - logic - - * - ``flush_i`` - - in - - TO_BE_COMPLETED - - TO_BE_COMPLETED - - logic - - * - ``valid_i`` - - in - - Load unit input port - - TO_BE_COMPLETED - - logic - - * - ``lsu_ctrl_i`` - - in - - TO_BE_COMPLETED - - TO_BE_COMPLETED - - lsu_ctrl_t - - * - ``pop_ld_o`` - - out - - TO_BE_COMPLETED - - TO_BE_COMPLETED - - logic - - * - ``valid_o`` - - out - - Load unit result is valid - - TO_BE_COMPLETED - - logic - - * - ``trans_id_o`` - - out - - Load transaction ID - - TO_BE_COMPLETED - - logic[CVA6Cfg.TRANS_ID_BITS-1:0] - - * - ``result_o`` - - out - - Load result - - TO_BE_COMPLETED - - logic[CVA6Cfg.XLEN-1:0] - - * - ``ex_o`` - - out - - Load exception - - TO_BE_COMPLETED - - exception_t - - * - ``translation_req_o`` - - out - - Request address translation - - TO_BE_COMPLETED - - logic - - * - ``vaddr_o`` - - out - - Virtual address - - TO_BE_COMPLETED - - logic[CVA6Cfg.VLEN-1:0] - - * - ``paddr_i`` - - in - - Physical address - - TO_BE_COMPLETED - - logic[CVA6Cfg.PLEN-1:0] - - * - ``ex_i`` - - in - - Excepted which appears before load - - TO_BE_COMPLETED - - exception_t - - * - ``page_offset_o`` - - out - - TO_BE_COMPLETED - - TO_BE_COMPLETED - - logic[11:0] - - * - ``page_offset_matches_i`` - - in - - TO_BE_COMPLETED - - TO_BE_COMPLETED - - logic - - * - ``store_buffer_empty_i`` - - in - - Store buffer is empty - - TO_BE_COMPLETED - - logic - - * - ``commit_tran_id_i`` - - in - - TO_BE_COMPLETED - - TO_BE_COMPLETED - - logic[CVA6Cfg.TRANS_ID_BITS-1:0] - - * - ``req_port_i`` - - in - - Data cache request out - - CACHES - - dcache_req_o_t - - * - ``req_port_o`` - - out - - Data cache request in - - CACHES - - dcache_req_i_t - - * - ``dcache_wbuffer_not_ni_i`` - - in - - TO_BE_COMPLETED - - 
TO_BE_COMPLETED - - logic - -Due to cv32a65x configuration, some ports are tied to a static value. These ports do not appear in the above table, they are listed below - -| As RVH = False, -| ``tinst_o`` output is tied to 0 -| ``hs_ld_st_inst_o`` output is tied to 0 -| ``hlvx_inst_o`` output is tied to 0 -| For any HW configuration, -| ``dtlb_hit_i`` input is tied to 1 -| As MMUPresent = 0, -| ``dtlb_ppn_i`` input is tied to 0 - diff --git a/docs/04_cv32a65x/design/source/port_lsu_bypass.rst b/docs/04_cv32a65x/design/source/port_lsu_bypass.rst deleted file mode 100644 index 8eac0f7ab4..0000000000 --- a/docs/04_cv32a65x/design/source/port_lsu_bypass.rst +++ /dev/null @@ -1,75 +0,0 @@ -.. - Copyright 2024 Thales DIS France SAS - Licensed under the Solderpad Hardware License, Version 2.1 (the "License"); - you may not use this file except in compliance with the License. - SPDX-License-Identifier: Apache-2.0 WITH SHL-2.1 - You may obtain a copy of the License at https://solderpad.org/licenses/ - - Original Author: Jean-Roch COULON - Thales - -.. _CVA6_lsu_bypass_ports: - -.. 
list-table:: **lsu_bypass module** IO ports - :header-rows: 1 - - * - Signal - - IO - - Description - - connexion - - Type - - * - ``clk_i`` - - in - - Subsystem Clock - - SUBSYSTEM - - logic - - * - ``rst_ni`` - - in - - Asynchronous reset active low - - SUBSYSTEM - - logic - - * - ``flush_i`` - - in - - TO_BE_COMPLETED - - TO_BE_COMPLETED - - logic - - * - ``lsu_req_i`` - - in - - TO_BE_COMPLETED - - TO_BE_COMPLETED - - lsu_ctrl_t - - * - ``lsu_req_valid_i`` - - in - - TO_BE_COMPLETED - - TO_BE_COMPLETED - - logic - - * - ``pop_ld_i`` - - in - - TO_BE_COMPLETED - - TO_BE_COMPLETED - - logic - - * - ``pop_st_i`` - - in - - TO_BE_COMPLETED - - TO_BE_COMPLETED - - logic - - * - ``lsu_ctrl_o`` - - out - - TO_BE_COMPLETED - - TO_BE_COMPLETED - - lsu_ctrl_t - - * - ``ready_o`` - - out - - TO_BE_COMPLETED - - TO_BE_COMPLETED - - logic - - diff --git a/docs/04_cv32a65x/design/source/port_mult.rst b/docs/04_cv32a65x/design/source/port_mult.rst deleted file mode 100644 index 41e342d72f..0000000000 --- a/docs/04_cv32a65x/design/source/port_mult.rst +++ /dev/null @@ -1,75 +0,0 @@ -.. - Copyright 2024 Thales DIS France SAS - Licensed under the Solderpad Hardware License, Version 2.1 (the "License"); - you may not use this file except in compliance with the License. - SPDX-License-Identifier: Apache-2.0 WITH SHL-2.1 - You may obtain a copy of the License at https://solderpad.org/licenses/ - - Original Author: Jean-Roch COULON - Thales - -.. _CVA6_mult_ports: - -.. 
list-table:: **mult module** IO ports - :header-rows: 1 - - * - Signal - - IO - - Description - - connexion - - Type - - * - ``clk_i`` - - in - - Subsystem Clock - - SUBSYSTEM - - logic - - * - ``rst_ni`` - - in - - Asynchronous reset active low - - SUBSYSTEM - - logic - - * - ``flush_i`` - - in - - Flush - - CONTROLLER - - logic - - * - ``fu_data_i`` - - in - - FU data needed to execute instruction - - ISSUE_STAGE - - fu_data_t - - * - ``mult_valid_i`` - - in - - Mult instruction is valid - - ISSUE_STAGE - - logic - - * - ``result_o`` - - out - - Mult result - - ISSUE_STAGE - - logic[CVA6Cfg.XLEN-1:0] - - * - ``mult_valid_o`` - - out - - Mult result is valid - - ISSUE_STAGE - - logic - - * - ``mult_ready_o`` - - out - - Mutl is ready - - ISSUE_STAGE - - logic - - * - ``mult_trans_id_o`` - - out - - Mult transaction ID - - ISSUE_STAGE - - logic[CVA6Cfg.TRANS_ID_BITS-1:0] - - diff --git a/docs/04_cv32a65x/design/source/port_multiplier.rst b/docs/04_cv32a65x/design/source/port_multiplier.rst deleted file mode 100644 index 5dfca691c0..0000000000 --- a/docs/04_cv32a65x/design/source/port_multiplier.rst +++ /dev/null @@ -1,87 +0,0 @@ -.. - Copyright 2024 Thales DIS France SAS - Licensed under the Solderpad Hardware License, Version 2.1 (the "License"); - you may not use this file except in compliance with the License. - SPDX-License-Identifier: Apache-2.0 WITH SHL-2.1 - You may obtain a copy of the License at https://solderpad.org/licenses/ - - Original Author: Jean-Roch COULON - Thales - -.. _CVA6_multiplier_ports: - -.. 
list-table:: **multiplier module** IO ports - :header-rows: 1 - - * - Signal - - IO - - Description - - connexion - - Type - - * - ``clk_i`` - - in - - Subsystem Clock - - SUBSYSTEM - - logic - - * - ``rst_ni`` - - in - - Asynchronous reset active low - - SUBSYSTEM - - logic - - * - ``trans_id_i`` - - in - - Multiplier transaction ID - - Mult - - logic[CVA6Cfg.TRANS_ID_BITS-1:0] - - * - ``mult_valid_i`` - - in - - Multiplier instruction is valid - - Mult - - logic - - * - ``operation_i`` - - in - - Multiplier operation - - Mult - - fu_op - - * - ``operand_a_i`` - - in - - A operand - - Mult - - logic[CVA6Cfg.XLEN-1:0] - - * - ``operand_b_i`` - - in - - B operand - - Mult - - logic[CVA6Cfg.XLEN-1:0] - - * - ``result_o`` - - out - - Multiplier result - - Mult - - logic[CVA6Cfg.XLEN-1:0] - - * - ``mult_valid_o`` - - out - - Mutliplier result is valid - - Mult - - logic - - * - ``mult_ready_o`` - - out - - Multiplier FU is ready - - Mult - - logic - - * - ``mult_trans_id_o`` - - out - - Multiplier transaction ID - - Mult - - logic[CVA6Cfg.TRANS_ID_BITS-1:0] - - diff --git a/docs/04_cv32a65x/design/source/port_ras.rst b/docs/04_cv32a65x/design/source/port_ras.rst deleted file mode 100644 index f0bdb4d401..0000000000 --- a/docs/04_cv32a65x/design/source/port_ras.rst +++ /dev/null @@ -1,61 +0,0 @@ -.. - Copyright 2024 Thales DIS France SAS - Licensed under the Solderpad Hardware License, Version 2.1 (the "License"); - you may not use this file except in compliance with the License. - SPDX-License-Identifier: Apache-2.0 WITH SHL-2.1 - You may obtain a copy of the License at https://solderpad.org/licenses/ - - Original Author: Jean-Roch COULON - Thales - -.. _CVA6_ras_ports: - -.. 
list-table:: **ras module** IO ports - :header-rows: 1 - - * - Signal - - IO - - Description - - connexion - - Type - - * - ``clk_i`` - - in - - Subsystem Clock - - SUBSYSTEM - - logic - - * - ``rst_ni`` - - in - - Asynchronous reset active low - - SUBSYSTEM - - logic - - * - ``push_i`` - - in - - Push address in RAS - - FRONTEND - - logic - - * - ``pop_i`` - - in - - Pop address from RAS - - FRONTEND - - logic - - * - ``data_i`` - - in - - Data to be pushed - - FRONTEND - - logic[CVA6Cfg.VLEN-1:0] - - * - ``data_o`` - - out - - Popped data - - FRONTEND - - ras_t - -Due to cv32a65x configuration, some ports are tied to a static value. These ports do not appear in the above table, they are listed below - -| For any HW configuration, -| ``flush_bp_i`` input is tied to 0 - diff --git a/docs/04_cv32a65x/design/source/port_scoreboard.rst b/docs/04_cv32a65x/design/source/port_scoreboard.rst deleted file mode 100644 index aa3b2bf472..0000000000 --- a/docs/04_cv32a65x/design/source/port_scoreboard.rst +++ /dev/null @@ -1,214 +0,0 @@ -.. - Copyright 2024 Thales DIS France SAS - Licensed under the Solderpad Hardware License, Version 2.1 (the "License"); - you may not use this file except in compliance with the License. - SPDX-License-Identifier: Apache-2.0 WITH SHL-2.1 - You may obtain a copy of the License at https://solderpad.org/licenses/ - - Original Author: Jean-Roch COULON - Thales - -.. _CVA6_scoreboard_ports: - -.. 
list-table:: **scoreboard module** IO ports - :header-rows: 1 - - * - Signal - - IO - - Description - - connexion - - Type - - * - ``clk_i`` - - in - - Subsystem Clock - - SUBSYSTEM - - logic - - * - ``rst_ni`` - - in - - Asynchronous reset active low - - SUBSYSTEM - - logic - - * - ``sb_full_o`` - - out - - TO_BE_COMPLETED - - TO_BE_COMPLETED - - logic - - * - ``flush_unissued_instr_i`` - - in - - Flush only un-issued instructions - - TO_BE_COMPLETED - - logic - - * - ``flush_i`` - - in - - Flush whole scoreboard - - TO_BE_COMPLETED - - logic - - * - ``rd_clobber_gpr_o`` - - out - - TO_BE_COMPLETED - - TO_BE_COMPLETED - - ariane_pkg::fu_t[2**ariane_pkg::REG_ADDR_SIZE-1:0] - - * - ``rd_clobber_fpr_o`` - - out - - TO_BE_COMPLETED - - TO_BE_COMPLETED - - ariane_pkg::fu_t[2**ariane_pkg::REG_ADDR_SIZE-1:0] - - * - ``rs1_i`` - - in - - rs1 operand address - - issue_read_operands - - logic[ariane_pkg::REG_ADDR_SIZE-1:0] - - * - ``rs1_o`` - - out - - rs1 operand - - issue_read_operands - - logic[CVA6Cfg.XLEN-1:0] - - * - ``rs1_valid_o`` - - out - - rs1 operand is valid - - issue_read_operands - - logic - - * - ``rs2_i`` - - in - - rs2 operand address - - issue_read_operands - - logic[ariane_pkg::REG_ADDR_SIZE-1:0] - - * - ``rs2_o`` - - out - - rs2 operand - - issue_read_operands - - logic[CVA6Cfg.XLEN-1:0] - - * - ``rs2_valid_o`` - - out - - rs2 operand is valid - - issue_read_operands - - logic - - * - ``rs3_i`` - - in - - rs3 operand address - - issue_read_operands - - logic[ariane_pkg::REG_ADDR_SIZE-1:0] - - * - ``rs3_o`` - - out - - rs3 operand - - issue_read_operands - - rs3_len_t - - * - ``rs3_valid_o`` - - out - - rs3 operand is valid - - issue_read_operands - - logic - - * - ``commit_instr_o`` - - out - - TO_BE_COMPLETED - - TO_BE_COMPLETED - - scoreboard_entry_t[CVA6Cfg.NrCommitPorts-1:0] - - * - ``commit_ack_i`` - - in - - TO_BE_COMPLETED - - TO_BE_COMPLETED - - logic[CVA6Cfg.NrCommitPorts-1:0] - - * - ``decoded_instr_i`` - - in - - TO_BE_COMPLETED - - 
TO_BE_COMPLETED - - scoreboard_entry_t[ariane_pkg::SUPERSCALAR:0] - - * - ``orig_instr_i`` - - in - - TO_BE_COMPLETED - - TO_BE_COMPLETED - - logic[ariane_pkg::SUPERSCALAR:0][31:0] - - * - ``decoded_instr_valid_i`` - - in - - TO_BE_COMPLETED - - TO_BE_COMPLETED - - logic[ariane_pkg::SUPERSCALAR:0] - - * - ``decoded_instr_ack_o`` - - out - - TO_BE_COMPLETED - - TO_BE_COMPLETED - - logic[ariane_pkg::SUPERSCALAR:0] - - * - ``orig_instr_o`` - - out - - TO_BE_COMPLETED - - TO_BE_COMPLETED - - logic[ariane_pkg::SUPERSCALAR:0][31:0] - - * - ``issue_instr_valid_o`` - - out - - TO_BE_COMPLETED - - TO_BE_COMPLETED - - logic[ariane_pkg::SUPERSCALAR:0] - - * - ``issue_ack_i`` - - in - - TO_BE_COMPLETED - - TO_BE_COMPLETED - - logic[ariane_pkg::SUPERSCALAR:0] - - * - ``resolved_branch_i`` - - in - - TO_BE_COMPLETED - - TO_BE_COMPLETED - - bp_resolve_t - - * - ``trans_id_i`` - - in - - Transaction ID at which to write the result back - - TO_BE_COMPLETED - - logic[CVA6Cfg.NrWbPorts-1:0][CVA6Cfg.TRANS_ID_BITS-1:0] - - * - ``wbdata_i`` - - in - - Results to write back - - TO_BE_COMPLETED - - logic[CVA6Cfg.NrWbPorts-1:0][CVA6Cfg.XLEN-1:0] - - * - ``ex_i`` - - in - - Exception from a functional unit (e.g.: ld/st exception) - - TO_BE_COMPLETED - - exception_t[CVA6Cfg.NrWbPorts-1:0] - - * - ``wt_valid_i`` - - in - - Indicates valid results - - TO_BE_COMPLETED - - logic[CVA6Cfg.NrWbPorts-1:0] - - * - ``x_we_i`` - - in - - Cvxif we for writeback - - TO_BE_COMPLETED - - logic - -Due to cv32a65x configuration, some ports are tied to a static value. 
These ports do not appear in the above table, they are listed below - -| As EnableAccelerator = 0, -| ``issue_instr_o`` output is tied to 0 -| As IsRVFI = 0, -| ``rvfi_issue_pointer_o`` output is tied to 0 -| ``rvfi_commit_pointer_o`` output is tied to 0 - diff --git a/docs/04_cv32a65x/design/source/port_scoreboard.rst.ori b/docs/04_cv32a65x/design/source/port_scoreboard.rst.ori deleted file mode 100644 index ebd3fc64ef..0000000000 --- a/docs/04_cv32a65x/design/source/port_scoreboard.rst.ori +++ /dev/null @@ -1,229 +0,0 @@ -.. - Copyright 2024 Thales DIS France SAS - Licensed under the Solderpad Hardware License, Version 2.1 (the "License"); - you may not use this file except in compliance with the License. - SPDX-License-Identifier: Apache-2.0 WITH SHL-2.1 - You may obtain a copy of the License at https://solderpad.org/licenses/ - - Original Author: Jean-Roch COULON - Thales - -.. _CVA6_scoreboard_ports: - -.. list-table:: scoreboard module IO ports - :header-rows: 1 - - * - Signal - - IO - - Description - - Connection - - Type - - * - ``clk_i`` - - in - - Clock - - TO_BE_COMPLETED - - logic - - * - ``rst_ni`` - - in - - Asynchronous reset active low - - TO_BE_COMPLETED - - logic - - * - ``sb_full_o`` - - out - - TO_BE_COMPLETED - - TO_BE_COMPLETED - - logic - - * - ``flush_unissued_instr_i`` - - in - - flush only un-issued instructions - - TO_BE_COMPLETED - - logic - - * - ``flush_i`` - - in - - flush whole scoreboard - - TO_BE_COMPLETED - - logic - - * - ``unresolved_branch_i`` - - in - - we have an unresolved branch - - TO_BE_COMPLETED - - logic - - * - ``rd_clobber_gpr_o`` - - out - - TO_BE_COMPLETED - - TO_BE_COMPLETED - - ariane_pkg::fu_t[2**ariane_pkg::REG_ADDR_SIZE-1:0] - - * - ``rd_clobber_fpr_o`` - - out - - TO_BE_COMPLETED - - TO_BE_COMPLETED - - ariane_pkg::fu_t[2**ariane_pkg::REG_ADDR_SIZE-1:0] - - * - ``rs1_i`` - - in - - TO_BE_COMPLETED - - TO_BE_COMPLETED - - logic[ariane_pkg::REG_ADDR_SIZE-1:0] - - * - ``rs1_o`` - - out - - TO_BE_COMPLETED - - 
TO_BE_COMPLETED - - riscv::xlen_t - - * - ``rs1_valid_o`` - - out - - TO_BE_COMPLETED - - TO_BE_COMPLETED - - logic - - * - ``rs2_i`` - - in - - TO_BE_COMPLETED - - TO_BE_COMPLETED - - logic[ariane_pkg::REG_ADDR_SIZE-1:0] - - * - ``rs2_o`` - - out - - TO_BE_COMPLETED - - TO_BE_COMPLETED - - riscv::xlen_t - - * - ``rs2_valid_o`` - - out - - TO_BE_COMPLETED - - TO_BE_COMPLETED - - logic - - * - ``rs3_i`` - - in - - TO_BE_COMPLETED - - TO_BE_COMPLETED - - logic[ariane_pkg::REG_ADDR_SIZE-1:0] - - * - ``rs3_o`` - - out - - TO_BE_COMPLETED - - TO_BE_COMPLETED - - rs3_len_t - - * - ``rs3_valid_o`` - - out - - TO_BE_COMPLETED - - TO_BE_COMPLETED - - logic - - * - ``commit_instr_o`` - - out - - TO_BE_COMPLETED - - COMMIT_STAGE - - ariane_pkg::scoreboard_entry_t[CVA6Cfg.NrCommitPorts-1:0] - - * - ``commit_ack_i`` - - in - - Advance the commit pointer when acknowledge - - COMMIT_STAGE - - logic[CVA6Cfg.NrCommitPorts-1:0] - - * - ``decoded_instr_i`` - - in - - TO_BE_COMPLETED - - TO_BE_COMPLETED - - ariane_pkg::scoreboard_entry_t - - * - ``orig_instr_i`` - - in - - TO_BE_COMPLETED - - TO_BE_COMPLETED - - logic[31:0] - - * - ``decoded_instr_valid_i`` - - in - - TO_BE_COMPLETED - - TO_BE_COMPLETED - - logic - - * - ``decoded_instr_ack_o`` - - out - - TO_BE_COMPLETED - - TO_BE_COMPLETED - - logic - - * - ``issue_instr_o`` - - out - - TO_BE_COMPLETED - - TO_BE_COMPLETED - - ariane_pkg::scoreboard_entry_t - - * - ``orig_instr_o`` - - out - - TO_BE_COMPLETED - - TO_BE_COMPLETED - - logic[31:0] - - * - ``issue_instr_valid_o`` - - out - - TO_BE_COMPLETED - - TO_BE_COMPLETED - - logic - - * - ``issue_ack_i`` - - in - - TO_BE_COMPLETED - - TO_BE_COMPLETED - - logic - - * - ``resolved_branch_i`` - - in - - TO_BE_COMPLETED - - TO_BE_COMPLETED - - ariane_pkg::bp_resolve_t - - * - ``trans_id_i`` - - in - - transaction ID at which to write the result back - - TO_BE_COMPLETED - - logic[CVA6Cfg.NrWbPorts-1:0][ariane_pkg::TRANS_ID_BITS-1:0] - - * - ``wbdata_i`` - - in - - write data in - - 
TO_BE_COMPLETED - - logic[CVA6Cfg.NrWbPorts-1:0][riscv::XLEN-1:0] - - * - ``ex_i`` - - in - - exception from a functional unit (e.g.: ld/st exception) - - TO_BE_COMPLETED - - ariane_pkg::exception_t[CVA6Cfg.NrWbPorts-1:0] - - * - ``wt_valid_i`` - - in - - data in is valid - - TO_BE_COMPLETED - - logic[CVA6Cfg.NrWbPorts-1:0] - - * - ``x_we_i`` - - in - - cvxif we for writeback - - TO_BE_COMPLETED - - logic - - * - ``rvfi_issue_pointer_o`` - - out - - TO_BE_COMPLETED - - TO_BE_COMPLETED - - logic[ariane_pkg::TRANS_ID_BITS-1:0] - - * - ``rvfi_commit_pointer_o`` - - out - - TO_BE_COMPLETED - - TO_BE_COMPLETED - - logic[CVA6Cfg.NrCommitPorts-1:0][ariane_pkg::TRANS_ID_BITS-1:0] diff --git a/docs/04_cv32a65x/design/source/port_serdiv.rst b/docs/04_cv32a65x/design/source/port_serdiv.rst deleted file mode 100644 index 467df2704a..0000000000 --- a/docs/04_cv32a65x/design/source/port_serdiv.rst +++ /dev/null @@ -1,99 +0,0 @@ -.. - Copyright 2024 Thales DIS France SAS - Licensed under the Solderpad Hardware License, Version 2.1 (the "License"); - you may not use this file except in compliance with the License. - SPDX-License-Identifier: Apache-2.0 WITH SHL-2.1 - You may obtain a copy of the License at https://solderpad.org/licenses/ - - Original Author: Jean-Roch COULON - Thales - -.. _CVA6_serdiv_ports: - -.. 
list-table:: **serdiv module** IO ports - :header-rows: 1 - - * - Signal - - IO - - Description - - connexion - - Type - - * - ``clk_i`` - - in - - Subsystem Clock - - SUBSYSTEM - - logic - - * - ``rst_ni`` - - in - - Asynchronous reset active low - - SUBSYSTEM - - logic - - * - ``id_i`` - - in - - Serdiv translation ID - - Mult - - logic[CVA6Cfg.TRANS_ID_BITS-1:0] - - * - ``op_a_i`` - - in - - A operand - - Mult - - logic[WIDTH-1:0] - - * - ``op_b_i`` - - in - - B operand - - Mult - - logic[WIDTH-1:0] - - * - ``rem`` - - in - - Serdiv operation - - Mult - - logic[1:0]opcode_i,//0:udiv,2:urem,1:div,3: - - * - ``in_vld_i`` - - in - - Serdiv instruction is valid - - Mult - - logic - - * - ``in_rdy_o`` - - out - - Serdiv FU is ready - - Mult - - logic - - * - ``flush_i`` - - in - - Flush - - CONTROLLER - - logic - - * - ``out_vld_o`` - - out - - Serdiv result is valid - - Mult - - logic - - * - ``out_rdy_i`` - - in - - Serdiv is ready - - Mult - - logic - - * - ``id_o`` - - out - - Serdiv transaction ID - - Mult - - logic[CVA6Cfg.TRANS_ID_BITS-1:0] - - * - ``res_o`` - - out - - Serdiv result - - Mult - - logic[WIDTH-1:0] - - diff --git a/docs/04_cv32a65x/design/source/port_store_unit.rst b/docs/04_cv32a65x/design/source/port_store_unit.rst deleted file mode 100644 index eda5a1058b..0000000000 --- a/docs/04_cv32a65x/design/source/port_store_unit.rst +++ /dev/null @@ -1,173 +0,0 @@ -.. - Copyright 2024 Thales DIS France SAS - Licensed under the Solderpad Hardware License, Version 2.1 (the "License"); - you may not use this file except in compliance with the License. - SPDX-License-Identifier: Apache-2.0 WITH SHL-2.1 - You may obtain a copy of the License at https://solderpad.org/licenses/ - - Original Author: Jean-Roch COULON - Thales - -.. _CVA6_store_unit_ports: - -.. 
list-table:: **store_unit module** IO ports - :header-rows: 1 - - * - Signal - - IO - - Description - - connexion - - Type - - * - ``clk_i`` - - in - - Subsystem Clock - - SUBSYSTEM - - logic - - * - ``rst_ni`` - - in - - Asynchronous reset active low - - SUBSYSTEM - - logic - - * - ``flush_i`` - - in - - Flush - - CONTROLLER - - logic - - * - ``stall_st_pending_i`` - - in - - TO_BE_COMPLETED - - TO_BE_COMPLETED - - logic - - * - ``no_st_pending_o`` - - out - - TO_BE_COMPLETED - - TO_BE_COMPLETED - - logic - - * - ``store_buffer_empty_o`` - - out - - Store buffer is empty - - TO_BE_COMPLETED - - logic - - * - ``valid_i`` - - in - - Store instruction is valid - - ISSUE_STAGE - - logic - - * - ``lsu_ctrl_i`` - - in - - Data input - - ISSUE_STAGE - - lsu_ctrl_t - - * - ``pop_st_o`` - - out - - TO_BE_COMPLETED - - TO_BE_COMPLETED - - logic - - * - ``commit_i`` - - in - - Instruction commit - - TO_BE_COMPLETED - - logic - - * - ``commit_ready_o`` - - out - - TO_BE_COMPLETED - - TO_BE_COMPLETED - - logic - - * - ``valid_o`` - - out - - Store result is valid - - ISSUE_STAGE - - logic - - * - ``trans_id_o`` - - out - - Transaction ID - - ISSUE_STAGE - - logic[CVA6Cfg.TRANS_ID_BITS-1:0] - - * - ``result_o`` - - out - - Store result - - ISSUE_STAGE - - logic[CVA6Cfg.XLEN-1:0] - - * - ``ex_o`` - - out - - Store exception output - - TO_BE_COMPLETED - - exception_t - - * - ``translation_req_o`` - - out - - Address translation request - - TO_BE_COMPLETED - - logic - - * - ``vaddr_o`` - - out - - Virtual address - - TO_BE_COMPLETED - - logic[CVA6Cfg.VLEN-1:0] - - * - ``paddr_i`` - - in - - Physical address - - TO_BE_COMPLETED - - logic[CVA6Cfg.PLEN-1:0] - - * - ``ex_i`` - - in - - Exception raised before store - - TO_BE_COMPLETED - - exception_t - - * - ``page_offset_i`` - - in - - Address to be checked - - load_unit - - logic[11:0] - - * - ``page_offset_matches_o`` - - out - - Address check result - - load_unit - - logic - - * - ``req_port_i`` - - in - - Data cache request - - 
CACHES - - dcache_req_o_t - - * - ``req_port_o`` - - out - - Data cache response - - CACHES - - dcache_req_i_t - -Due to cv32a65x configuration, some ports are tied to a static value. These ports do not appear in the above table, they are listed below - -| As RVA = False, -| ``amo_valid_commit_i`` input is tied to 0 -| ``amo_req_o`` output is tied to 0 -| ``amo_resp_i`` input is tied to 0 -| As IsRVFI = 0, -| ``rvfi_mem_paddr_o`` output is tied to 0 -| As RVH = False, -| ``tinst_o`` output is tied to 0 -| ``hs_ld_st_inst_o`` output is tied to 0 -| ``hlvx_inst_o`` output is tied to 0 -| For any HW configuration, -| ``dtlb_hit_i`` input is tied to 1 - diff --git a/docs/04_cv32a65x/index.rst b/docs/04_cv32a65x/index.rst index 71e91ee8e5..ad7e6d23a7 100644 --- a/docs/04_cv32a65x/index.rst +++ b/docs/04_cv32a65x/index.rst @@ -6,4 +6,4 @@ CV32A65X documentation riscv/unpriv.rst riscv/priv.rst - design/source/index.rst + design/design.rst diff --git a/docs/04_cv32a65x/riscv/priv-isa-cv32a65x.html b/docs/04_cv32a65x/riscv/priv-isa-cv32a65x.html deleted file mode 100644 index 13d9d4fc3a..0000000000 --- a/docs/04_cv32a65x/riscv/priv-isa-cv32a65x.html +++ /dev/null @@ -1,4801 +0,0 @@ - - - - - - - - -The RISC-V Instruction Set Manual for CV32A65X: Volume II: Privileged Architecture - - - - - - -
-
-
-
-

This document describes the RISC-V privileged architecture tailored for -OpenHW Group CV32A65X. -Irrelevant parts (e.g. unsupported extensions) of the original -specification are replaced by placeholders.

-
-
-

Contributors to all versions of the spec in alphabetical order (please contact -editors to suggest corrections): Krste Asanović, Peter Ashenden, Rimas -Avižienis, Jacob Bachmeyer, Allen J. Baum, Jonathan Behrens, Paolo Bonzini, Ruslan Bukin, -Christopher Celio, Chuanhua Chang, David Chisnall, Anthony Coulter, Palmer Dabbelt, Monte -Dalrymple, Paul Donahue, Greg Favor, Dennis Ferguson, Marc Gauthier, Andy Glew, -Gary Guo, Mike Frysinger, John Hauser, David Horner, Olof -Johansson, David Kruckemyer, Yunsup Lee, Daniel Lustig, Andrew Lutomirski, Prashanth Mundkur, -Jonathan Neuschäfer, Rishiyur -Nikhil, Stefan O’Rear, Albert Ou, John Ousterhout, David Patterson, Dmitri -Pavlov, Kade Phillips, Josh Scheid, Colin Schmidt, Michael Taylor, Wesley Terpstra, Matt Thomas, Tommy Thorn, Ray -VanDeWalker, Megan Wachs, Steve Wallach, Andrew Waterman, Claire Wolf, -and Reinoud Zandijk.

-
-
-

This document is released under a Creative Commons Attribution 4.0 International License.

-
-
-

This document is a derivative of the RISC-V -privileged specification version 1.9.1 released under the following license: ©2010-2017 Andrew Waterman, Yunsup Lee, Rimas -Avižienis, -David Patterson, Krste Asanović. Creative Commons Attribution 4.0 International License.

-
-
-

Contributors to CV32A65X versions of the spec in alphabetical order: -Jean-Roch Coulon, André Sintzoff.

-
-
-
-
-

Preface

-
-
-

Preface to Version for CV32A65X

-
-
-

This document describes the RISC-V privileged architecture tailored for -OpenHW Group CV32A65X.

-
-
-

Preface to Version 20240703

-
-
-

This document describes the RISC-V privileged architecture. This -release, version 20240703, contains the following versions of the RISC-V ISA -modules:

-
- ----- - - - - - - - - - - - - - - -
ModuleVersionStatus

Machine ISA
-Smstateen Extension
-Smcsrind/Sscsrind Extension
-Smepmp
-Smcntrpmf
-Smrnmi Extension
-Smcdeleg
-Smdbltrp
-Supervisor ISA
-Svade Extension
-Svnapot Extension
-Svpbmt Extension
-Svinval Extension
-Svadu Extension
-Sstc
-Sscofpmf
-Ssdbltrp
-Hypervisor ISA
-Shlcofideleg
-Svvptc

1.13
-1.0
-1.0
-1.0
-1.0
-1.0
-1.0
-1.0
-1.13
-1.0
-1.0
-1.0
-1.0
-1.0
-1.0
-1.0
-1.0
-1.0
-0.1
-1.0

Draft
-Ratified
-Ratified
-Ratified
-Ratified
-Ratified
-Ratified
-Draft
-Draft
-Ratified
-Ratified
-Ratified
-Ratified
-Ratified
-Ratified
-Ratified
-Draft
-Ratified
-Draft
-Ratified

-
-

The following changes have been made since version 1.12 of the Machine and -Supervisor ISAs, which, while not strictly backwards compatible, are not -anticipated to cause software portability problems in practice:

-
-
-
    -
  • -

    Redefined misa.MXL to be read-only, making MXLEN a constant.

    -
  • -
  • -

    Added the constraint that SXLEN≥UXLEN.

    -
  • -
-
-
-

Additionally, the following compatible changes have been -made to the Machine and Supervisor ISAs since version 1.12:

-
-
-
    -
  • -

    Defined the misa.B field to reflect that the B extension has been -implemented.

    -
  • -
  • -

    Defined the misa.V field to reflect that the V extension has been -implemented.

    -
  • -
  • -

    Defined the RV32-only medelegh and hedelegh CSRs.

    -
  • -
  • -

    Defined the misaligned atomicity granule PMA, superseding the proposed Zam -extension.

    -
  • -
  • -

    Allocated interrupt 13 for Sscofpmf LCOFI interrupt.

    -
  • -
  • -

    Defined hardware error and software check exception codes.

    -
  • -
  • -

    Specified synchronization requirements when changing the PBMTE fields -in menvcfg and henvcfg.

    -
  • -
  • -

    Exposed count-overflow interrupts to VS-mode via the Shlcofideleg extension.

    -
  • -
  • -

    Relaxed behavior of some HINTs when MXLEN > XLEN.

    -
  • -
-
-
-

Finally, the following clarifications and document improvements have been made -since the last document release:

-
-
-
    -
  • -

    Transliterated the document from LaTeX into AsciiDoc.

    -
  • -
  • -

    Included all ratified extensions through March 2024.

    -
  • -
  • -

    Clarified that "platform- or custom-use" interrupts are actually -"platform-use interrupts", where the platform can choose to make some custom.

    -
  • -
  • -

    Clarified semantics of explicit accesses to CSRs wider than XLEN bits.

    -
  • -
  • -

    Clarified that MXLEN≥SXLEN.

    -
  • -
  • -

    Clarified that WFI is not a HINT instruction.

    -
  • -
  • -

    Clarified that VS-stage page-table accesses set G-stage A/D bits.

    -
  • -
  • -

    Clarified ordering rules when PBMT=IO is used on main-memory regions.

    -
  • -
  • -

    Clarified ordering rules for hardware A/D bit updates.

    -
  • -
  • -

    Clarified that, for a given exception cause, xtval might sometimes -be set to a nonzero value but sometimes not.

    -
  • -
  • -

    Clarified exception behavior of unimplemented or inaccessible CSRs.

    -
  • -
  • -

    Clarified that Svpbmt allows implementations to override additional PMAs.

    -
  • -
  • -

    Replaced the concept of vacant memory regions with inaccessible memory or I/O regions.

    -
  • -
  • -

    Clarified that timer and count-overflow interrupts' arrival in -interrupt-pending registers is not immediate.

    -
  • -
-
-
-

Preface to Version 20211203

-
-
-

This document describes the RISC-V privileged architecture. This -release, version 20211203, contains the following versions of the RISC-V -ISA modules:

-
- ----- - - - - - - - - - - - - - - -
ModuleVersionStatus

Machine ISA
-Supervisor ISA
-Svnapot Extension
-Svpbmt Extension
-Svinval Extension
-Hypervisor ISA

1.12
-1.12
-1.0
-1.0
-1.0
-1.0

Ratified
-Ratified
-Ratified
-Ratified
-Ratified
-Ratified

-
-

The following changes have been made since version 1.11, which, while -not strictly backwards compatible, are not anticipated to cause software -portability problems in practice:

-
-
-
    -
  • -

    Changed MRET and SRET to clear mstatus.MPRV when leaving M-mode.

    -
  • -
  • -

    Reserved additional satp patterns for future use.

    -
  • -
  • -

    Stated that the scause Exception Code field must implement bits 4–0 -at minimum.

    -
  • -
  • -

    Relaxed I/O regions have been specified to follow RVWMO. The previous -specification implied that PPO rules other than fences and -acquire/release annotations did not apply.

    -
  • -
  • -

    Constrained the LR/SC reservation set size and shape when using -page-based virtual memory.

    -
  • -
  • -

    PMP changes require an SFENCE.VMA on any hart that implements -page-based virtual memory, even if VM is not currently enabled.

    -
  • -
  • -

    Allowed for speculative updates of page table entry A bits.

    -
  • -
  • -

    Clarify that if the address-translation algorithm non-speculatively -reaches a PTE in which a bit reserved for future standard use is set, a -page-fault exception must be raised.

    -
  • -
-
-
-

Additionally, the following compatible changes have been made since -version 1.11:

-
-
-
    -
  • -

    Removed the N extension.

    -
  • -
  • -

    Defined the mandatory RV32-only CSR mstatush, which contains most of -the same fields as the upper 32 bits of RV64’s mstatus.

    -
  • -
  • -

    Defined the mandatory CSR mconfigptr, which if nonzero contains the -address of a configuration data structure.

    -
  • -
  • -

    Defined optional mseccfg and mseccfgh CSRs, which control the -machine’s security configuration.

    -
  • -
  • -

    Defined menvcfg, henvcfg, and senvcfg CSRs (and RV32-only -menvcfgh and henvcfgh CSRs), which control various characteristics -of the execution environment.

    -
  • -
  • -

    Designated part of SYSTEM major opcode for custom use.

    -
  • -
  • -

    Permitted the unconditional delegation of less-privileged interrupts.

    -
  • -
  • -

    Added optional big-endian and bi-endian support.

    -
  • -
  • -

    Made priority of load/store/AMO address-misaligned exceptions -implementation-defined relative to load/store/AMO page-fault and -access-fault exceptions.

    -
  • -
  • -

    PMP reset values are now platform-defined.

    -
  • -
  • -

    An additional 48 optional PMP registers have been defined.

    -
  • -
  • -

    Slightly relaxed the atomicity requirement for A and D bit updates -performed by the implementation.

    -
  • -
  • -

    Clarify the architectural behavior of address-translation caches.

    -
  • -
  • -

    Added Sv57 and Sv57x4 address translation modes.

    -
  • -
  • -

    Software breakpoint exceptions are permitted to write either 0 or the -pc to xtval.

    -
  • -
  • -

    Clarified that bare S-mode need not support the SFENCE.VMA -instruction.

    -
  • -
  • -

    Specified relaxed constraints for implicit reads of non-idempotent -regions.

    -
  • -
  • -

    Added the Svnapot Standard Extension, along with the N bit in Sv39, -Sv48, and Sv57 PTEs.

    -
  • -
  • -

    Added the Svpbmt Standard Extension, along with the PBMT bits in Sv39, -Sv48, and Sv57 PTEs.

    -
  • -
  • -

    Added the Svinval Standard Extension and associated instructions.

    -
  • -
-
-
-

Finally, the hypervisor architecture proposal has been extensively -revised.

-
-
-

Preface to Version 1.11

-
-
-

This is version 1.11 of the RISC-V privileged architecture. The document -contains the following versions of the RISC-V ISA modules:

-
- ----- - - - - - - - - - - - - - - -
ModuleVersionStatus

Machine ISA
-Supervisor ISA
-Hypervisor ISA

1.11
-1.11
-0.3

Ratified
-Ratified
-Draft

-
-

Changes from version 1.10 include:

-
-
-
    -
  • -

    Moved Machine and Supervisor spec to Ratified status.

    -
  • -
  • -

    Improvements to the description and commentary.

    -
  • -
  • -

    Added a draft proposal for a hypervisor extension.

    -
  • -
  • -

    Specified which interrupt sources are reserved for standard use.

    -
  • -
  • -

    Allocated some synchronous exception causes for custom use.

    -
  • -
  • -

    Specified the priority ordering of synchronous exceptions.

    -
  • -
  • -

    Added specification that xRET instructions may, but are not required -to, clear LR reservations if A extension present.

    -
  • -
  • -

    The virtual-memory system no longer permits supervisor mode to execute -instructions from user pages, regardless of the SUM setting.

    -
  • -
  • -

    Clarified that ASIDs are private to a hart, and added commentary about -the possibility of a future global-ASID extension.

    -
  • -
  • -

    SFENCE.VMA semantics have been clarified.

    -
  • -
  • -

    Made the mstatus.MPP field WARL, rather than WLRL.

    -
  • -
  • -

    Made the unused xip fields WPRI, rather than WIRI.

    -
  • -
  • -

    Made the unused misa fields WARL, rather than WIRI.

    -
  • -
  • -

    Made the unused pmpaddr and pmpcfg fields WARL, rather than WIRI.

    -
  • -
  • -

    Required all harts in a system to employ the same PTE-update scheme as -each other.

    -
  • -
  • -

    Rectified an editing error that misdescribed the mechanism by which -mstatus.xIE is written upon an exception.

    -
  • -
  • -

    Described scheme for emulating misaligned AMOs.

    -
  • -
  • -

    Specified the behavior of the misa and xepc registers in systems -with variable IALIGN.

    -
  • -
  • -

    Specified the behavior of writing self-contradictory values to the -misa register.

    -
  • -
  • -

    Defined the mcountinhibit CSR, which stops performance counters from -incrementing to reduce energy consumption.

    -
  • -
  • -

    Specified semantics for PMP regions coarser than four bytes.

    -
  • -
  • -

    Specified contents of CSRs across XLEN modification.

    -
  • -
  • -

    Moved PLIC chapter into its own document.

    -
  • -
-
-
-

Preface to Version 1.10

-
-
-

This is version 1.10 of the RISC-V privileged architecture proposal. -Changes from version 1.9.1 include:

-
-
-
    -
  • -

    The previous version of this document was released under a Creative -Commons Attribution 4.0 International License by the original authors, -and this and future versions of this document will be released under the -same license.

    -
  • -
  • -

    The explicit convention on shadow CSR addresses has been removed to -reclaim CSR space. Shadow CSRs can still be added as needed.

    -
  • -
  • -

    The mvendorid register now contains the JEDEC code of the core -provider as opposed to a code supplied by the Foundation. This avoids -redundancy and offloads work from the Foundation.

    -
  • -
  • -

    The interrupt-enable stack discipline has been simplified.

    -
  • -
  • -

    An optional mechanism to change the base ISA used by supervisor and -user modes has been added to the mstatus CSR, and the field previously -called Base in misa has been renamed to MXL for consistency.

    -
  • -
  • -

    Clarified expected use of XS to summarize additional extension state -status fields in mstatus.

    -
  • -
  • -

    Optional vectored interrupt support has been added to the mtvec and -stvec CSRs.

    -
  • -
  • -

    The SEIP and UEIP bits in the mip CSR have been redefined to support -software injection of external interrupts.

    -
  • -
  • -

    The mbadaddr register has been subsumed by a more general mtval -register that can now capture bad instruction bits on an illegal -instruction fault to speed instruction emulation.

    -
  • -
  • -

    The machine-mode base-and-bounds translation and protection schemes have been removed from the specification as part of moving the virtual memory configuration to sptbr (now satp). Some of the motivation for the base-and-bounds schemes is now covered by the PMP registers, but space remains available in mstatus to add these back at a later date if deemed useful.

    -
  • -
  • -

    In systems with only M-mode, or with both M-mode and U-mode but -without U-mode trap support, the medeleg and mideleg registers now -do not exist, whereas previously they returned zero.

    -
  • -
  • -

    Virtual-memory page faults now have mcause values distinct from -physical-memory access faults. Page-fault exceptions can now be -delegated to S-mode without delegating exceptions generated by PMA and -PMP checks.

    -
  • -
  • -

    An optional physical-memory protection (PMP) scheme has been proposed.

    -
  • -
  • -

    The supervisor virtual memory configuration has been moved from the -mstatus register to the sptbr register. Accordingly, the sptbr -register has been renamed to satp (Supervisor Address Translation and -Protection) to reflect its broadened role.

    -
  • -
  • -

    The SFENCE.VM instruction has been removed in favor of the improved -SFENCE.VMA instruction.

    -
  • -
  • -

    The mstatus bit MXR has been exposed to S-mode via sstatus.

    -
  • -
  • -

    The polarity of the PUM bit in sstatus has been inverted to shorten -code sequences involving MXR. The bit has been renamed to SUM.

    -
  • -
  • -

    Hardware management of page-table entry Accessed and Dirty bits has -been made optional; simpler implementations may trap to software to set -them.

    -
  • -
  • -

    The counter-enable scheme has changed, so that S-mode can control -availability of counters to U-mode.

    -
  • -
  • -

    H-mode has been removed, as we are focusing on recursive -virtualization support in S-mode. The encoding space has been reserved -and may be repurposed at a later date.

    -
  • -
  • -

    A mechanism to improve virtualization performance by trapping S-mode -virtual-memory management operations has been added.

    -
  • -
  • -

    The Supervisor Binary Interface (SBI) chapter has been removed, so -that it can be maintained as a separate specification.

    -
  • -
-
-
-

Preface to Version 1.9.1

-
-
-

This is version 1.9.1 of the RISC-V privileged architecture proposal. -Changes from version 1.9 include:

-
-
-
    -
  • -

    Numerous additions and improvements to the commentary sections.

    -
  • -
  • -

    Changed the configuration string proposal to use a search process that supports various formats, including Device Tree String and flattened Device Tree.

    -
  • -
  • -

    Made misa optionally writable to support modifying base and -supported ISA extensions. CSR address of misa changed.

    -
  • -
  • -

    Added description of debug mode and debug CSRs.

    -
  • -
  • -

    Added a hardware performance monitoring scheme. Simplified the -handling of existing hardware counters, removing privileged versions of -the counters and the corresponding delta registers.

    -
  • -
  • -

    Fixed description of SPIE in presence of user-level interrupts.

    -
  • -
-
-
-
-
-

1. Introduction

-
-
-

This document describes the RISC-V privileged architecture, which covers -all aspects of RISC-V systems beyond the unprivileged ISA, including -privileged instructions as well as additional functionality required for -running operating systems and attaching external devices.

-
-
- - - - - -
- - -
-

Commentary on our design decisions is formatted as in this paragraph, -and can be skipped if the reader is only interested in the specification -itself.

-
-
-
-

We briefly note that the entire privileged-level design described in -this document could be replaced with an entirely different -privileged-level design without changing the unprivileged ISA, and -possibly without even changing the ABI. In particular, this privileged -specification was designed to run existing popular operating systems, -and so embodies the conventional level-based protection model. Alternate -privileged specifications could embody other more flexible -protection-domain models. For simplicity of expression, the text is -written as if this was the only possible privileged architecture.

-
-
-
-
-

1.1. RISC-V Privileged Software Stack Terminology

-
-

This section describes the terminology we use to describe components of -the wide range of possible privileged software stacks for RISC-V.

-
-
-

Figure 1 shows some of the possible software stacks -that can be supported by the RISC-V architecture. The left-hand side -shows a simple system that supports only a single application running on -an application execution environment (AEE). The application is coded to -run with a particular application binary interface (ABI). The ABI -includes the supported user-level ISA plus a set of ABI calls to -interact with the AEE. The ABI hides details of the AEE from the -application to allow greater flexibility in implementing the AEE. The -same ABI could be implemented natively on multiple different host OSs, -or could be supported by a user-mode emulation environment running on a -machine with a different native ISA.

-
-
- - - - - -
- - -
-

Our graphical convention represents abstract interfaces using black -boxes with white text, to separate them from concrete instances of -components implementing the interfaces.

-
-
-
-
-
-privimps -
-
Figure 1. Different implementation stacks supporting various forms of privileged execution.
-
-
-

The middle configuration shows a conventional operating system (OS) that -can support multiprogrammed execution of multiple applications. Each -application communicates over an ABI with the OS, which provides the -AEE. Just as applications interface with an AEE via an ABI, RISC-V -operating systems interface with a supervisor execution environment -(SEE) via a supervisor binary interface (SBI). An SBI comprises the -user-level and supervisor-level ISA together with a set of SBI function -calls. Using a single SBI across all SEE implementations allows a single -OS binary image to run on any SEE. The SEE can be a simple boot loader -and BIOS-style IO system in a low-end hardware platform, or a -hypervisor-provided virtual machine in a high-end server, or a thin -translation layer over a host operating system in an architecture -simulation environment.

-
-
- - - - - -
- - -
-

Most supervisor-level ISA definitions do not separate the SBI from the -execution environment and/or the hardware platform, complicating -virtualization and bring-up of new hardware platforms.

-
-
-
-
-

The rightmost configuration shows a virtual machine monitor -configuration where multiple multiprogrammed OSs are supported by a -single hypervisor. Each OS communicates via an SBI with the hypervisor, -which provides the SEE. The hypervisor communicates with the hypervisor -execution environment (HEE) using a hypervisor binary interface (HBI), -to isolate the hypervisor from details of the hardware platform.

-
-
- - - - - -
- - -
-

The ABI, SBI, and HBI are still a work-in-progress, but we are now -prioritizing support for Type-2 hypervisors where the SBI is provided -recursively by an S-mode OS.

-
-
-
-
-

Hardware implementations of the RISC-V ISA will generally require -additional features beyond the privileged ISA to support the various -execution environments (AEE, SEE, or HEE).

-
-
-
-

1.2. Privilege Levels

-
-

At any time, a RISC-V hardware thread (hart) is running at some -privilege level encoded as a mode in one or more CSRs (control and -status registers). Three RISC-V privilege levels are currently defined -as shown in Table 1.

-
- - ------ - - - - - - - - - - - - - - - - -
Table 1. RISC-V privilege levels.
LevelEncodingNameAbbreviation

0
-1
-2
-3

00
-01
-10
-11

User/Application
-Supervisor
-Reserved
-Machine

U
-S

-M

-
-

Privilege levels are used to provide protection between different -components of the software stack, and attempts to perform operations not -permitted by the current privilege mode will cause an exception to be -raised. These exceptions will normally cause traps into an underlying -execution environment.

-
-
- - - - - -
- - -
-

In the description, we try to separate the privilege level for which -code is written, from the privilege mode in which it runs, although the -two are often tied. For example, a supervisor-level operating system can -run in supervisor-mode on a system with three privilege modes, but can -also run in user-mode under a classic virtual machine monitor on systems -with two or more privilege modes. In both cases, the same -supervisor-level operating system binary code can be used, coded to a -supervisor-level SBI and hence expecting to be able to use -supervisor-level privileged instructions and CSRs. When running a guest -OS in user mode, all supervisor-level actions will be trapped and -emulated by the SEE running in the higher-privilege level.

-
-
-
-
-

The machine level has the highest privileges and is the only mandatory -privilege level for a RISC-V hardware platform. Code run in machine-mode -(M-mode) is usually inherently trusted, as it has low-level access to -the machine implementation. M-mode can be used to manage secure -execution environments on RISC-V. User-mode (U-mode) and supervisor-mode -(S-mode) are intended for conventional application and operating system -usage respectively.

-
-
-

Each privilege level has a core set of privileged ISA extensions with -optional extensions and variants. For example, machine-mode supports an -optional standard extension for memory protection. Also, supervisor mode -can be extended to support Type-2 hypervisor execution as described in -Chapter 14.

-
-
-

Implementations might provide anywhere from 1 to 3 privilege modes -trading off reduced isolation for lower implementation cost, as shown in -Table 2.

-
- - ----- - - - - - - - - - - - - - - -
Table 2. Supported combination of privilege modes.
Number of levelsSupported ModesIntended Usage

1
-2
-3

M
-M, U
-M, S, U

Simple embedded systems
-Secure embedded systems
-Systems running Unix-like operating systems

-
-

All hardware implementations must provide M-mode, as this is the only -mode that has unfettered access to the whole machine. The simplest -RISC-V implementations may provide only M-mode, though this will provide -no protection against incorrect or malicious application code.

-
-
- - - - - -
- - -
-

The lock feature of the optional PMP facility can provide some limited -protection even with only M-mode implemented.

-
-
-
-
-

Many RISC-V implementations will also support at least user mode -(U-mode) to protect the rest of the system from application code. -Supervisor mode (S-mode) can be added to provide isolation between a -supervisor-level operating system and the SEE.

-
-
-

A hart normally runs application code in U-mode until some trap (e.g., a -supervisor call or a timer interrupt) forces a switch to a trap handler, -which usually runs in a more privileged mode. The hart will then execute -the trap handler, which will eventually resume execution at or after the -original trapped instruction in U-mode. Traps that increase privilege -level are termed vertical traps, while traps that remain at the same -privilege level are termed horizontal traps. The RISC-V privileged -architecture provides flexible routing of traps to different privilege -layers.

-
-
- - - - - -
- - -
-

Horizontal traps can be implemented as vertical traps that return -control to a horizontal trap handler in the less-privileged mode.

-
-
-
-
-
-

1.3. Debug Mode

-
-

Implementations may also include a debug mode to support off-chip -debugging and/or manufacturing test. Debug mode (D-mode) can be -considered an additional privilege mode, with even more access than -M-mode. The separate debug specification proposal describes operation of -a RISC-V hart in debug mode. Debug mode reserves a few CSR addresses -that are only accessible in D-mode, and may also reserve some portions -of the physical address space on a platform.

-
-
-
-
-
-

2. Control and Status Registers (CSRs)

-
-
-

The SYSTEM major opcode is used to encode all privileged instructions in -the RISC-V ISA. These can be divided into two main classes: those that -atomically read-modify-write control and status registers (CSRs), which -are defined in the Zicsr extension, and all other privileged -instructions. The privileged architecture requires the Zicsr extension; -which other privileged instructions are required depends on the -privileged-architecture feature set.

-
-
-

In addition to the unprivileged state described in Volume I of this -manual, an implementation may contain additional CSRs, accessible by -some subset of the privilege levels using the CSR instructions described -in Volume I. In this chapter, we map out the CSR address space. The -following chapters describe the function of each of the CSRs according -to privilege level, as well as the other privileged instructions which -are generally closely associated with a particular privilege level. Note -that although CSRs and instructions are associated with one privilege -level, they are also accessible at all higher privilege levels.

-
-
-

Standard CSRs do not have side effects on reads but may have side -effects on writes.

-
-
-

2.1. CSR Address Mapping Conventions

-
-

The standard RISC-V ISA sets aside a 12-bit encoding space (csr[11:0]) for up to 4,096 CSRs. By convention, the upper 4 bits of the CSR address (csr[11:8]) are used to encode the read and write accessibility of the CSRs according to privilege level as shown in Table 3. The top two bits (csr[11:10]) indicate whether the register is read/write (00, 01, or 10) or read-only (11). The next two bits (csr[9:8]) encode the lowest privilege level that can access the CSR.

-
-
- - - - - -
- - -
-

The CSR address convention uses the upper bits of the CSR address to -encode default access privileges. This simplifies error checking in the -hardware and provides a larger CSR space, but does constrain the mapping -of CSRs into the address space.

-
-
-

Implementations might allow a more-privileged level to trap otherwise -permitted CSR accesses by a less-privileged level to allow these -accesses to be intercepted. This change should be transparent to the -less-privileged software.

-
-
-
-
-

Instructions that access a non-existent CSR are reserved. Attempts to access a CSR without appropriate privilege level raise illegal-instruction exceptions or, as described in the Trap Cause Codes section of the hypervisor extension chapter, virtual-instruction exceptions. Attempts to write a read-only register raise illegal-instruction exceptions. A read/write register might also contain some bits that are read-only, in which case writes to the read-only bits are ignored.

-
-
-

Table 3 also indicates the convention to -allocate CSR addresses between standard and custom uses. The CSR -addresses designated for custom uses will not be redefined by future -standard extensions.

-
-
-

Machine-mode standard read-write CSRs 0x7A0-0x7BF are reserved for -use by the debug system. Of these CSRs, 0x7A0-0x7AF are accessible -to machine mode, whereas 0x7B0-0x7BF are only visible to debug mode. -Implementations should raise illegal-instruction exceptions on -machine-mode access to the latter set of registers.

-
-
- - - - - -
- - -
-

Effective virtualization requires that as many instructions run natively as possible inside a virtualized environment, while any privileged accesses trap to the virtual machine monitor (Goldberg, 1974). CSRs that are read-only at some lower privilege level are shadowed into separate CSR addresses if they are made read-write at a higher privilege level. This avoids trapping permitted lower-privilege accesses while still causing traps on illegal accesses. Currently, the counters are the only shadowed CSRs.

-
-
-
-
-
-

2.2. CSR Listing

-
-

Table 4-Table 8 list the CSRs that -have currently been allocated CSR addresses. The timers, counters, and -floating-point CSRs are standard unprivileged CSRs. The other registers -are used by privileged code, as described in the following chapters. -Note that not all registers are required on all implementations.

-
- - ---------- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
Table 3. Allocation of RISC-V CSR address ranges.

CSR Address

Hex

Use and Accessibility

[11:10]

[9:8]

[7:4]

Unprivileged and User-Level CSRs

00

00

XXXX

0x000-0x0FF

Standard read/write

01

00

XXXX

0x400-0x4FF

Standard read/write

10

00

XXXX

0x800-0x8FF

Custom read/write

11

00

0XXX

0xC00-0xC7F

Standard read-only

11

00

10XX

0xC80-0xCBF

Standard read-only

11

00

11XX

0xCC0-0xCFF

Custom read-only

Supervisor-Level CSRs

00

01

XXXX

0x100-0x1FF

Standard read/write

01

01

0XXX

0x500-0x57F

Standard read/write

01

01

10XX

0x580-0x5BF

Standard read/write

01

01

11XX

0x5C0-0x5FF

Custom read/write

10

01

0XXX

0x900-0x97F

Standard read/write

10

01

10XX

0x980-0x9BF

Standard read/write

10

01

11XX

0x9C0-0x9FF

Custom read/write

11

01

0XXX

0xD00-0xD7F

Standard read-only

11

01

10XX

0xD80-0xDBF

Standard read-only

11

01

11XX

0xDC0-0xDFF

Custom read-only

Hypervisor and VS CSRs

00

10

XXXX

0x200-0x2FF

Standard read/write

01

10

0XXX

0x600-0x67F

Standard read/write

01

10

10XX

0x680-0x6BF

Standard read/write

01

10

11XX

0x6C0-0x6FF

Custom read/write

10

10

0XXX

0xA00-0xA7F

Standard read/write

10

10

10XX

0xA80-0xABF

Standard read/write

10

10

11XX

0xAC0-0xAFF

Custom read/write

11

10

0XXX

0xE00-0xE7F

Standard read-only

11

10

10XX

0xE80-0xEBF

Standard read-only

11

10

11XX

0xEC0-0xEFF

Custom read-only

Machine-Level CSRs

00

11

XXXX

0x300-0x3FF

Standard read/write

01

11

0XXX

0x700-0x77F

Standard read/write

01

11

100X

0x780-0x79F

Standard read/write

01

11

1010

0x7A0-0x7AF

Standard read/write debug CSRs

01

11

1011

0x7B0-0x7BF

Debug-mode-only CSRs

01

11

11XX

0x7C0-0x7FF

Custom read/write

10

11

0XXX

0xB00-0xB7F

Standard read/write

10

11

10XX

0xB80-0xBBF

Standard read/write

10

11

11XX

0xBC0-0xBFF

Custom read/write

11

11

0XXX

0xF00-0xF7F

Standard read-only

11

11

10XX

0xF80-0xFBF

Standard read-only

11

11

11XX

0xFC0-0xFFF

Custom read-only

-
- - ------ - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
Table 4. Currently allocated RISC-V unprivileged CSR addresses.
NumberPrivilegeNameDescription

Unprivileged Floating-Point CSRs

0x001
-0x002
-0x003

URW
-URW
-URW

fflags
-frm
-fcsr

Floating-Point Accrued Exceptions.
-Floating-Point Dynamic Rounding Mode.
-Floating-Point Control and Status Register (frm +fflags).

Unprivileged Zicfiss extension CSR

0x011

URW

ssp

Shadow Stack Pointer.

Unprivileged Counter/Timers

0xC00
-0xC01
-0xC02
-0xC03
-0xC04
-  
-0xC1F
-0xC80
-0xC81
-0xC82
-0xC83
-0xC84

-0xC9F

URO
-URO
-URO
-URO
-URO

-URO
-URO
-URO
-URO
-URO
-URO

-URO

cycle
-time
-instret
-hpmcounter3
-hpmcounter4
-⋮
-hpmcounter31
-cycleh
-timeh
-instreth
-hpmcounter3h
-hpmcounter4h
-⋮
-hpmcounter31h

Cycle counter for RDCYCLE instruction.
-Timer for RDTIME instruction.
-Instructions-retired counter for RDINSTRET instruction.
-Performance-monitoring counter.
-Performance-monitoring counter.

-Performance-monitoring counter.
-Upper 32 bits of cycle, RV32 only.
-Upper 32 bits of time, RV32 only.
-Upper 32 bits of instret, RV32 only.
-Upper 32 bits of hpmcounter3, RV32 only.
-Upper 32 bits of hpmcounter4, RV32 only.

-Upper 32 bits of hpmcounter31, RV32 only.

-
- - ------ - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
Table 5. Currently allocated RISC-V supervisor-level CSR addresses.
NumberPrivilegeNameDescription

Supervisor Trap Setup

0x100
-0x104
-0x105
-0x106

SRW
-SRW
-SRW
-SRW

sstatus
-sie
-stvec
-scounteren

Supervisor status register.
-Supervisor interrupt-enable register.
-Supervisor trap handler base address.
-Supervisor counter enable.

Supervisor Configuration

0x10A

SRW

senvcfg

Supervisor environment configuration register.

Supervisor Counter Setup

0x120

SRW

scountinhibit

Supervisor counter-inhibit register.

Supervisor Trap Handling

0x140
-0x141
-0x142
-0x143
-0x144
-0xDA0

SRW
-SRW
-SRW
-SRW
-SRW
-SRO

sscratch
-sepc
-scause
-stval
-sip
-scountovf

Scratch register for supervisor trap handlers.
-Supervisor exception program counter.
-Supervisor trap cause.
-Supervisor bad address or instruction.
-Supervisor interrupt pending.
-Supervisor count overflow.

Supervisor Protection and Translation

0x180

SRW

satp

Supervisor address translation and protection.

Debug/Trace Registers

0x5A8

SRW

scontext

Supervisor-mode context register.

Supervisor State Enable Registers

0x10C
- 0x10D
- 0x10E
- 0x10F

SRW
- SRW
- SRW
- SRW

sstateen0
- sstateen1
- sstateen2
- sstateen3

Supervisor State Enable 0 Register.
- Supervisor State Enable 1 Register.
- Supervisor State Enable 2 Register.
- Supervisor State Enable 3 Register.

-
- - ------ - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
Table 6. Currently allocated RISC-V hypervisor and VS CSR addresses.
NumberPrivilegeNameDescription

Hypervisor Trap Setup

0x600
-0x602
-0x603
-0x604
-0x606
-0x607
-0x612

HRW
-HRW
-HRW
-HRW
-HRW
-HRW
-HRW

hstatus
-hedeleg
-hideleg
-hie
-hcounteren
-hgeie
-hedelegh

Hypervisor status register.
-Hypervisor exception delegation register.
-Hypervisor interrupt delegation register.
-Hypervisor interrupt-enable register.
-Hypervisor counter enable.
-Hypervisor guest external interrupt-enable register.
-Upper 32 bits of hedeleg, RV32 only.

Hypervisor Trap Handling

0x643
-0x644
-0x645
-0x64A
-0xE12

HRW
-HRW
-HRW
-HRW
-HRO

htval
-hip
-hvip
-htinst
-hgeip

Hypervisor bad guest physical address.
-Hypervisor interrupt pending.
-Hypervisor virtual interrupt pending.
-Hypervisor trap instruction (transformed).
-Hypervisor guest external interrupt pending.

Hypervisor Configuration

0x60A
-0x61A

HRW
-HRW

henvcfg
-henvcfgh

Hypervisor environment configuration register.
-Upper 32 bits of henvcfg, RV32 only.

Hypervisor Protection and Translation

0x680

HRW

hgatp

Hypervisor guest address translation and protection.

Debug/Trace Registers

0x6A8

HRW

hcontext

Hypervisor-mode context register.

Hypervisor Counter/Timer Virtualization Registers

0x605
-0x615

HRW
-HRW

htimedelta
-htimedeltah

Delta for VS/VU-mode timer.
-Upper 32 bits of htimedelta, RV32 only.

Hypervisor State Enable Registers

0x60C
- 0x60D
- 0x60E
- 0x60F
- 0x61C
- 0x61D
- 0x61E
- 0x61F

HRW
- HRW
- HRW
- HRW
- HRW
- HRW
- HRW
- HRW

hstateen0
- hstateen1
- hstateen2
- hstateen3
- hstateen0h
- hstateen1h
- hstateen2h
- hstateen3h

Hypervisor State Enable 0 Register.
- Hypervisor State Enable 1 Register.
- Hypervisor State Enable 2 Register.
- Hypervisor State Enable 3 Register.
- Upper 32 bits of Hypervisor State Enable 0 Register, RV32 only.
- Upper 32 bits of Hypervisor State Enable 1 Register, RV32 only.
- Upper 32 bits of Hypervisor State Enable 2 Register, RV32 only.
- Upper 32 bits of Hypervisor State Enable 3 Register, RV32 only.

Virtual Supervisor Registers

0x200
-0x204
-0x205
-0x240
-0x241
-0x242
-0x243
-0x244
-0x280

HRW
-HRW
-HRW
-HRW
-HRW
-HRW
-HRW
-HRW
-HRW

vsstatus
-vsie
-vstvec
-vsscratch
-vsepc
-vscause
-vstval
-vsip
-vsatp

Virtual supervisor status register.
-Virtual supervisor interrupt-enable register.
-Virtual supervisor trap handler base address.
-Virtual supervisor scratch register.
-Virtual supervisor exception program counter.
-Virtual supervisor trap cause.
-Virtual supervisor bad address or instruction.
-Virtual supervisor interrupt pending.
-Virtual supervisor address translation and protection.

-
- - ------ - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
Table 7. Currently allocated RISC-V machine-level CSR addresses.
NumberPrivilegeNameDescription

Machine Information Registers

0xF11
-0xF12
-0xF13
-0xF14
-0xF15

MRO
-MRO
-MRO
-MRO
-MRO

mvendorid
-marchid
-mimpid
-mhartid
-mconfigptr

Vendor ID.
-Architecture ID.
-Implementation ID.
-Hardware thread ID.
-Pointer to configuration data structure.

Machine Trap Setup

0x300
-0x301
-0x302
-0x303
-0x304
-0x305
-0x306
-0x310
-0x312

MRW
-MRW
-MRW
-MRW
-MRW
-MRW
-MRW
-MRW
-MRW

mstatus
-misa
-medeleg
-mideleg
-mie
-mtvec
-mcounteren
-mstatush
-medelegh

Machine status register.
-ISA and extensions.
-Machine exception delegation register.
-Machine interrupt delegation register.
-Machine interrupt-enable register.
-Machine trap-handler base address.
-Machine counter enable.
-Additional machine status register, RV32 only.
-Upper 32 bits of medeleg, RV32 only.

Machine Trap Handling

0x340
-0x341
-0x342
-0x343
-0x344
-0x34A
-0x34B

MRW
-MRW
-MRW
-MRW
-MRW
-MRW
-MRW

mscratch
-mepc
-mcause
-mtval
-mip
-mtinst
-mtval2

Scratch register for machine trap handlers.
-Machine exception program counter.
-Machine trap cause.
-Machine bad address or instruction.
-Machine interrupt pending.
-Machine trap instruction (transformed).
-Machine bad guest physical address.

Machine Configuration

0x30A
-0x31A
-0x747
-0x757

MRW
-MRW
-MRW
-MRW

menvcfg
-menvcfgh
-mseccfg
-mseccfgh

Machine environment configuration register.
-Upper 32 bits of menvcfg, RV32 only.
-Machine security configuration register.
-Upper 32 bits of mseccfg, RV32 only.

Machine Memory Protection

0x3A0
-0x3A1
-0x3A2
-0x3A3

-0x3AE
-0x3AF
-0x3B0
-0x3B1

-0x3EF

MRW
-MRW
-MRW
-MRW

-MRW
-MRW
-MRW
-MRW

-MRW

pmpcfg0
-pmpcfg1
-pmpcfg2
-pmpcfg3
-⋯
-pmpcfg14
-pmpcfg15
-pmpaddr0
-pmpaddr1
-⋯
-pmpaddr63

Physical memory protection configuration.
-Physical memory protection configuration, RV32 only.
-Physical memory protection configuration.
-Physical memory protection configuration, RV32 only.

-Physical memory protection configuration.
-Physical memory protection configuration, RV32 only.
-Physical memory protection address register.
-Physical memory protection address register.

-Physical memory protection address register.

Machine State Enable Registers

0x30C
- 0x30D
- 0x30E
- 0x30F
- 0x31C
- 0x31D
- 0x31E
- 0x31F

MRW
- MRW
- MRW
- MRW
- MRW
- MRW
- MRW
- MRW

mstateen0
- mstateen1
- mstateen2
- mstateen3
- mstateen0h
- mstateen1h
- mstateen2h
- mstateen3h

Machine State Enable 0 Register.
- Machine State Enable 1 Register.
- Machine State Enable 2 Register.
- Machine State Enable 3 Register.
- Upper 32 bits of Machine State Enable 0 Register, RV32 only.
- Upper 32 bits of Machine State Enable 1 Register, RV32 only.
- Upper 32 bits of Machine State Enable 2 Register, RV32 only.
- Upper 32 bits of Machine State Enable 3 Register, RV32 only.

-
- - ------ - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
Table 8. Currently allocated RISC-V machine-level CSR addresses.
NumberPrivilegeNameDescription

Machine Non-Maskable Interrupt Handling

0x740
-0x741
-0x742
-0x744

MRW
-MRW
-MRW
-MRW

mnscratch
-mnepc
-mncause
-mnstatus

Resumable NMI scratch register.
-Resumable NMI program counter.
-Resumable NMI cause.
-Resumable NMI status.

Machine Counter/Timers

0xB00
-0xB02
-0xB03
-0xB04

-0xB1F
-0xB80
-0xB82
-0xB83
-0xB84

-0xB9F

MRW
-MRW
-MRW
-MRW

-MRW
-MRW
-MRW
-MRW
-MRW

-MRW

mcycle
-minstret
-mhpmcounter3
-mhpmcounter4
-⋮
-mhpmcounter31
-mcycleh
-minstreth
-mhpmcounter3h
-mhpmcounter4h
-⋮
-mhpmcounter31h

Machine cycle counter.
-Machine instructions-retired counter.
-Machine performance-monitoring counter.
-Machine performance-monitoring counter.

-Machine performance-monitoring counter.
-Upper 32 bits of mcycle, RV32 only.
-Upper 32 bits of minstret, RV32 only.
-Upper 32 bits of mhpmcounter3, RV32 only.
-Upper 32 bits of mhpmcounter4, RV32 only.

-Upper 32 bits of mhpmcounter31, RV32 only.

Machine Counter Setup

0x320
-0x323
-0x324

-0x33F
-0x723
-0x724

-0x73F

MRW
-MRW
-MRW

-MRW
-MRW
-MRW

-MRW

mcountinhibit
-mhpmevent3
-mhpmevent4
-⋮
-mhpmevent31
-mhpmevent3h
-mhpmevent4h
-⋮
-mhpmevent31h

Machine counter-inhibit register.
-Machine performance-monitoring event selector.
-Machine performance-monitoring event selector.

-Machine performance-monitoring event selector.
-Upper 32 bits of mhpmevent3, RV32 only.
-Upper 32 bits of mhpmevent4, RV32 only.

-Upper 32 bits of mhpmevent31, RV32 only.

Debug/Trace Registers (shared with Debug Mode)

0x7A0
-0x7A1
-0x7A2
-0x7A3
-0x7A8

MRW
-MRW
-MRW
-MRW
-MRW

tselect
-tdata1
-tdata2
-tdata3
-mcontext

Debug/Trace trigger register select.
-First Debug/Trace trigger data register.
-Second Debug/Trace trigger data register.
-Third Debug/Trace trigger data register.
-Machine-mode context register.

Debug Mode Registers

0x7B0
-0x7B1
-0x7B2
-0x7B3

DRW
-DRW
-DRW
-DRW

dcsr
-dpc
-dscratch0
-dscratch1

Debug control and status register.
-Debug program counter.
-Debug scratch register 0.
-Debug scratch register 1.

-
-
-

2.3. CSR Field Specifications

-
-

The following definitions and abbreviations are used in specifying the -behavior of fields within the CSRs.

-
-
-

2.3.1. Reserved Writes Preserve Values, Reads Ignore Values (WPRI)

-
-

Some whole read/write fields are reserved for future use. Software -should ignore the values read from these fields, and should preserve the -values held in these fields when writing values to other fields of the -same register. For forward compatibility, implementations that do not -furnish these fields must make them read-only zero. These fields are -labeled WPRI in the register descriptions.

-
-
- - - - - -
- - -
-

To simplify the software model, any backward-compatible future -definition of previously reserved fields within a CSR must cope with the -possibility that a non-atomic read/modify/write sequence is used to -update other fields in the CSR. Alternatively, the original CSR -definition must specify that subfields can only be updated atomically, -which may require a two-instruction clear bit/set bit sequence in -general that can be problematic if intermediate values are not legal.

-
-
-
-
-
- -
-

Some read/write CSR fields specify behavior for only a subset of -possible bit encodings, with other bit encodings reserved. Software -should not write anything other than legal values to such a field, and -should not assume a read will return a legal value unless the last write -was of a legal value, or the register has not been written since another -operation (e.g., reset) set the register to a legal value. These fields -are labeled WLRL in the register descriptions.

-
-
- - - - - -
- - -
-

Hardware implementations need only implement enough state bits to -differentiate between the supported values, but must always return the -complete specified bit-encoding of any supported value when read.

-
-
-
-
-

Implementations are permitted but not required to raise an -illegal-instruction exception if an instruction attempts to write a -non-supported value to a WLRL field. Implementations can return arbitrary -bit patterns on the read of a WLRL field when the last write was of an -illegal value, but the value returned should deterministically depend on -the illegal written value and the value of the field prior to the write.

-
-
-
- -
-

Some read/write CSR fields are only defined for a subset of bit -encodings, but allow any value to be written while guaranteeing to -return a legal value whenever read. Assuming that writing the CSR has no -other side effects, the range of supported values can be determined by -attempting to write a desired setting then reading to see if the value -was retained. These fields are labeled WARL in the register descriptions.

-
-
-

Implementations will not raise an exception on writes of unsupported -values to a WARL field. Implementations can return any legal value on the -read of a WARL field when the last write was of an illegal value, but the -legal value returned should deterministically depend on the illegal -written value and the architectural state of the hart.

-
-
-
-
-

2.4. CSR Field Modulation

-
-

If a write to one CSR changes the set of legal values allowed for a -field of a second CSR, then unless specified otherwise, the second CSR’s -field immediately gets an UNSPECIFIED value from among its new legal values. This -is true even if the field’s value before the write remains legal after -the write; the value of the field may be changed in consequence of the -write to the controlling CSR.

-
-
- - - - - -
- - -
-

As a special case of this rule, the value written to one CSR may control -whether a field of a second CSR is writable (with multiple legal values) -or is read-only. When a write to the controlling CSR causes the second -CSR’s field to change from previously read-only to now writable, that -field immediately gets an UNSPECIFIED but legal value, unless specified otherwise.

-
-
-
-

Some CSR fields are, when writable, defined as aliases of other CSR -fields. Let x be such a CSR field, and let y be the CSR field it aliases when writable. If a write to a controlling CSR causes field x to change from previously read-only to now writable, the new value of x is not UNSPECIFIED but instead immediately reflects the existing value of its alias y, as required.

-
-
-
-
-

A change to the value of a CSR for this reason is not a write to the -affected CSR and thus does not trigger any side effects specified for -that CSR.

-
-
-
-

2.5. Implicit Reads of CSRs

-
-

Implementations sometimes perform implicit reads of CSRs. (For -example, all S-mode instruction fetches implicitly read the satp CSR.) -Unless otherwise specified, the value returned by an implicit read of a -CSR is the same value that would have been returned by an explicit read -of the CSR, using a CSR-access instruction in a sufficient privilege -mode.

-
-
-
-

2.6. CSR Width Modulation

-
-

If the width of a CSR is changed (for example, by changing SXLEN or -UXLEN, as described in Section 3.1.6.3), the -values of the writable fields and bits of the new-width CSR are, -unless specified otherwise, determined from the previous-width CSR as -though by this algorithm:

-
-
-
    -
  1. -

    The value of the previous-width CSR is copied to a temporary register -of the same width.

    -
  2. -
  3. -

    For the read-only bits of the previous-width CSR, the bits at the same -positions in the temporary register are set to zeros.

    -
  4. -
  5. -

    The width of the temporary register is changed to the new width. If -the new width W is narrower than the previous width, the -least-significant W bits of the temporary register are -retained and the more-significant bits are discarded. If the new width -is wider than the previous width, the temporary register is -zero-extended to the wider width.

    -
  6. -
  7. -

    Each writable field of the new-width CSR takes the value of the bits -at the same positions in the temporary register.

    -
  8. -
-
-
-

Changing the width of a CSR is not a read or write of the CSR and thus -does not trigger any side effects.

-
-
-
-

2.7. Explicit Accesses to CSRs Wider than XLEN

-
-

If a standard CSR is wider than XLEN bits, then an explicit read -of the CSR returns the register’s least-significant XLEN bits, -and an explicit write to the CSR modifies only the register’s -least-significant XLEN bits, leaving the upper bits unchanged.

-
-
-

Some standard CSRs, such as the counter CSRs of extension -Zicntr, are always 64 bits, even when XLEN=32 (RV32). -For each such 64-bit CSR (for example, counter time), -a corresponding 32-bit high-half CSR is usually defined with -the same name but with the letter ‘h’ appended at the end (timeh). -The high-half CSR aliases bits 63:32 of its namesake -64-bit CSR, thus providing a way for RV32 software -to read and modify the otherwise-unreachable 32 bits.

-
-
-

Standard high-half CSRs are accessible only when -the base RISC-V instruction set is RV32 (XLEN=32). -For RV64 (when XLEN=64), the addresses of all standard high-half CSRs -are reserved, so an attempt to access a high-half CSR -typically raises an illegal-instruction exception.

-
-
-
-
-
-

3. Machine-Level ISA, Version 1.13

-
-
-

This chapter describes the machine-level operations available in -machine-mode (M-mode), which is the highest privilege mode in a RISC-V -hart. M-mode is used for low-level access to a hardware platform and -is the first mode entered at reset. M-mode can also be used to implement -features that are too difficult or expensive to implement in hardware -directly. The RISC-V machine-level ISA contains a common core that is -extended depending on which other privilege levels are supported and -other details of the hardware implementation.

-
-
-

3.1. Machine-Level CSRs

-
-

3.1.1. Machine ISA (misa) Register

-
-

The misa CSR is a WARL read-write register reporting the ISA supported by the hart.

-
-
-
-Diagram -
-
Figure 2. Machine ISA register (misa)
-
-
-

[CVA6] The MXL (Machine XLEN) field encodes the native base integer ISA width as -shown in Table 9. The MXL field is read-only. -In CVA6, the misa register returns the MXL field which indicates the -effective XLEN in M-mode, a constant termed MXLEN.

-
- - ---- - - - - - - - - - - - - -
Table 9. Encoding of MXL field in misa
MXLXLEN

1
-2
-3

32
-64
-128

-
-

The misa CSR is MXLEN bits wide.

-
-
-

[CVA6] The Extensions field encodes the presence of the standard extensions, with a single bit per letter of the alphabet (bit 0 encodes presence of extension "A", bit 1 encodes presence of extension "B", through to bit 25 which encodes "Z"). The "I" bit will be set for RV32I, RV64I, and RV128I base ISAs, and the "E" bit will be set for RV32E and RV64E. In CVA6, the Extensions field is not writable: the presence of standard extensions corresponds to the hardware reset value and cannot be modified by writing to the register.

-
- - ----- - - - - - - - - - - - - - - -
Table 10. Encoding of Extensions field in misa. All bits that are reserved for future use must return zero when read.
BitCharacterDescription

0
-1
-2
-3
-4
-5
-6
-7
-8
-9
-10
-11
-12
-13
-14
-15
-16
-17
-18
-19
-20
-21
-22
-23
-24
-25

A
-B
-C
-D
-E
-F
-G
-H
-I
-J
-K
-L
-M
-N
-O
-P
-Q
-R
-S
-T
-U
-V
-W
-X
-Y
-Z

Atomic extension
-B extension
-Compressed extension
-Double-precision floating-point extension
-RV32E/64E base ISA
-Single-precision floating-point extension
-Reserved
-Hypervisor extension
-RV32I/64I/128I base ISA
-Reserved
-Reserved
-Reserved
-Integer Multiply/Divide extension
-Tentatively reserved for User-Level Interrupts extension
-Reserved
-Tentatively reserved for Packed-SIMD extension
-Quad-precision floating-point extension
-Reserved
-Supervisor mode implemented
-Reserved
-User mode implemented
-Vector extension
-Reserved
-Non-standard extensions present
-Reserved
-Reserved

-
-

The "U" and "S" bits will be set if there is support for user and -supervisor modes respectively.

-
-
-

The "X" bit will be set if there are any non-standard extensions.

-
-
-

When "B" bit is 1, the implementation supports the instructions provided by the -Zba, Zbb, and Zbs extensions. When "B" bit is 0, it indicates that the -implementation may not support one or more of the Zba, Zbb, or Zbs extensions.

-
-
-
-

3.1.2. Machine Vendor ID (mvendorid) Register

-
-

[CVA6] The mvendorid CSR is a 32-bit read-only register providing the JEDEC -manufacturer ID of the provider of the core. -In CVA6, mvendorid is implemented and returns the commercial implementation -id supplied to OpenHW Group organization, 0x602.

-
-
-
-Diagram -
-
Figure 3. Vendor ID register (mvendorid)
-
-
-
-

3.1.3. Machine Architecture ID (marchid) Register

-
-

[CVA6] The marchid CSR is an MXLEN-bit read-only register encoding the base -microarchitecture of the hart. -In CVA6, marchid is implemented and returns the base microarchitecture -of the hart supplied to CVA6, 0x3.

-
-
-
-Diagram -
-
Figure 4. Machine Architecture ID (marchid) register
-
-
-
-

3.1.4. Machine Implementation ID (mimpid) Register

-
-

The mimpid CSR provides a unique encoding of the version of the -processor implementation.

-
-
-

[CVA6] The mimpid register is implemented and the return value is TODO. -The Implementation value should reflect the design of the RISC-V -processor itself and not any surrounding system.

-
-
-
-Diagram -
-
Figure 5. Machine Implementation ID (mimpid) register
-
-
-
-

3.1.5. Hart ID (mhartid) Register

-
-

[CV32A65X] The mhartid CSR is an MXLEN-bit read-only register containing the integer ID of the hardware thread running the code. This register is readable. In a CV32A65X-based system, only one hart is implemented, and its hart ID is zero.

-
-
-
-Diagram -
-
Figure 6. Hart ID (mhartid) register
-
-
-
-

3.1.6. Machine Status (mstatus and mstatush) Registers

-
-

[CV32A65X] The mstatus register is an MXLEN-bit read/write register formatted as -shown in Figure 7. The mstatus register -keeps track of and controls the hart’s current operating state.

-
-
-
-Diagram -
-
Figure 7. Machine-mode status (mstatus) register for RV32
-
-
-

[CV32A65X] mstatush is a 32-bit read/write register formatted as -shown in Figure 8.

-
-
-
-Diagram -
-
Figure 8. Additional machine-mode status (mstatush) register for RV32.
-
-
-
3.1.6.1. Privilege and Global Interrupt-Enable Stack in mstatus register
-
-

[CV32A65X] As only M-mode is implemented, the instructions and registers related to less-privileged modes are not implemented. A global interrupt-enable bit, MIE, is provided for M-mode. This bit is primarily used to guarantee atomicity with respect to interrupt handlers in the current privilege mode.

-
-
-

[CV32A65X] When a hart is executing in privilege mode M, interrupts are globally -enabled when MIE=1 and globally disabled when MIE=0.

-
-
-

TODO

-
-
-

[CV32A65X] An MRET instruction is used to return from a trap in M-mode. When executing an MRET instruction, MIE is set to MPIE; MPIE is set to 1; and MPP retains the value M.

-
-
-

[CV32A65X] As privilege modes S and U are not implemented, the MPP field is a WARL field that can hold only privilege mode M (i.e., it is read-only with binary value 11), and SPP and UPP are read-only 0.

-
-
-
-
3.1.6.2. Double Trap Control in mstatus Register
-
-

[CV32A65X] As Double Trap Control (Smdbltrp extension) is not implemented, -MDT field is read-only 0.

-
-
-
-
3.1.6.3. Base ISA Control in mstatus Register
-
-

[CV32A65X] The SXL and UXL fields do not exist.

-
-
-
-
3.1.6.4. Memory Privilege in mstatus Register
-
-

[CV32A65X] As U-Mode is not implemented, the MPRV (Modify PRiVilege) bit is read-only 0. -Loads and stores behave as normal, using the translation and protection -mechanisms of the current privilege mode.

-
-
-

[CV32A65X] As S-mode is not implemented, MXR and SUM are read-only 0.

-
-
-
-
3.1.6.5. Endianness Control in mstatus and mstatush Registers
-
-

The MBE, SBE, and UBE bits in mstatus and mstatush are WARL fields that -control the endianness of memory accesses other than instruction -fetches. Instruction fetches are always little-endian.

-
-
-

MBE controls whether non-instruction-fetch memory accesses made from -M-mode (assuming mstatus.MPRV=0) are little-endian (MBE=0) or -big-endian (MBE=1).

-
-
-

Memory accesses made from M-mode are always little-endian, so MBE is read-only zero.

-
-
-

S-Mode is not implemented, SBE is read-only 0.

-
-
-

U-Mode is not implemented, UBE is read-only 0.

-
-
-
-
3.1.6.6. Virtualization Support in mstatus Register
-
-

The TVM (Trap Virtual Memory) bit is a WARL field that supports intercepting -supervisor virtual-memory management operations.

-
-
-

S-Mode is not implemented, TVM is read-only 0.

-
-
-

The TW (Timeout Wait) bit is a WARL field that supports intercepting the WFI -instruction (see Section 3.3.3).

-
-
-

TW is read-only 0 because there are no modes less privileged -than M.

-
-
-

The TSR (Trap SRET) bit is a WARL field that supports intercepting the -supervisor exception return instruction, SRET.

-
-
-

As S-mode is not implemented, TSR is read-only 0.

-
-
-
-
3.1.6.7. Extension Context Status in mstatus Register
-
-

Supporting substantial extensions is one of the primary goals of RISC-V, -and hence we define a standard interface to allow unchanged -privileged-mode code, particularly a supervisor-level OS, to support -arbitrary user-mode state extensions.

-
-
-

[CV32A65X] The FS[1:0] and VS[1:0] WARL fields and the XS[1:0] read-only field are used -to reduce the cost of context save and restore by setting and tracking -the current state of the floating-point unit and any other user-mode -extensions respectively.

-
-
-

As neither the F extension nor S-mode is implemented, then -FS is read-only zero.

-
-
-

As neither the v registers nor S-mode is implemented, then -VS is read-only zero.

-
-
-

As no additional user extensions require new state, the -XS field is read-only zero. TODO

-
-
-

[CV32A65X] The SD bit is a read-only bit that summarizes whether either the FS, VS, -or XS fields signal the presence of some dirty state that will require -saving extended user context to memory.

-
-
-

[CV32A65X] As FS, XS, and VS are all read-only zero, SD is also always -zero.

-
-
-

[CV32A65X] When an extension’s status is set to Off, any instruction that attempts -to read or write the corresponding state will cause an -illegal-instruction exception.

-
-
-
-
3.1.6.8. Previous Expected Landing Pad (ELP) State in mstatus Register
-
-

[CV32A65X] As the Zicfilp extension is not supported, -the SPELP and MPELP fields are read-only zero.

-
-
-
-
-

3.1.7. Machine Trap-Vector Base-Address (mtvec) Register

-
-

The mtvec register is an MXLEN-bit WARL read/write register that holds -trap vector configuration, consisting of a vector base address (BASE) -and a vector mode (MODE).

-
-
-
-Diagram -
-
Figure 9. Machine trap-vector base-address (mtvec) register.
-
-
-

[CV32A65X] The mtvec register is writable. The value in the BASE field must -always be aligned on a 4-byte boundary. mtvec is always accessed in -Mode=Direct.

-
- - ----- - - - - - - - - - - - - - - -
Table 11. Encoding of mtvec MODE field.
ValueNameDescription

0
-1
-≥2

Direct
-Vectored
----

All traps set pc to BASE.
-Asynchronous interrupts set pc to BASE+4×cause.
-Reserved

-
-

The encoding of the MODE field is shown in -Table 11. When MODE=Direct, all traps into -machine mode cause the pc to be set to the address in the BASE field.

-
-
-
-

3.1.8. Machine Trap Delegation (medeleg and mideleg) Registers

-
-

[CV32A65X] All traps at any privilege level are handled in machine -mode.

-
-
-

[CV32A65X] As S-mode is not implemented, the medeleg and mideleg registers do not exist. -The SPP, SPIE, SIE fields of mstatus are read-only zero.

-
-
-
-

3.1.9. Machine Interrupt (mip and mie) Registers

-
-

The mip register is an MXLEN-bit read/write register containing -information on pending interrupts, while mie is the corresponding -MXLEN-bit read/write register containing interrupt enable bits. -Interrupt cause number i (as reported in CSR mcause, -Section 3.1.15) corresponds with bit i in both mip and -mie. Bits 15:0 are allocated to standard interrupt causes only, while -bits 16 and above are designated for platform use.

-
-
-
-Diagram -
-
Figure 10. Machine Interrupt-Pending (mip) register.
-
-
-
-Diagram -
-
Figure 11. Machine Interrupt-Enable (mie) register
-
-
-

[CV32A65X] As only M-Mode is implemented, an interrupt i will trap -to M-mode (causing the privilege mode to change to M-mode) if all of the -following are true: (a) the MIE bit in the mstatus register is set; -(b) bit i is set in both mip and mie.

-
-
-

[CV32A65X] These conditions for an interrupt trap to occur must be evaluated in a -bounded amount of time from when an interrupt becomes, or ceases to be, -pending in mip, and must also be evaluated immediately following the -execution of an MRET instruction or an explicit write to a CSR on -which these interrupt trap conditions expressly depend (including mip, -mie, mstatus, and mideleg).

-
-
-

[CV32A65X] Each individual bit in register mip is read-only. If interrupt i -can become pending but bit i in mip is read-only, the implementation -must provide some other mechanism for clearing the pending interrupt.

-
-
-

[CV32A65X] TODO: A bit in mie must be writable if the corresponding interrupt can ever -become pending. Bits of mie that are not writable must be read-only -zero.

-
-
-

[CV32A65X] The standard portions (bits 15:0) of registers mip and mie are -formatted as shown in Figure 12 and Figure 13 respectively.

-
-
-
-Diagram -
-
Figure 12. Standard portion (bits 15:0) of mip.
-
-
-
-Diagram -
-
Figure 13. Standard portion (bits 15:0) of mie.
-
-
-

Bits mip.MEIP and mie.MEIE are the interrupt-pending and -interrupt-enable bits for machine-level external interrupts. MEIP is -read-only in mip, and is set and cleared by a platform-specific -interrupt controller.

-
-
-

Bits mip.MTIP and mie.MTIE are the interrupt-pending and -interrupt-enable bits for machine timer interrupts. MTIP is read-only in -mip, and is cleared by writing to the memory-mapped machine-mode timer -compare register.

-
-
-

As the system has only one hart then mip.MSIP and mie.MSIE are -read-only zeros.

-
-
-

As supervisor mode is not implemented, bits SEIP, STIP, and SSIP of -mip and SEIE, STIE, and SSIE of mie are read-only zeros.

-
-
-

As the Sscofpmf extension is not implemented, mip.LCOFIP and mie.LCOFIE are read-only zeros.

-
-
-

Multiple simultaneous interrupts destined for M-mode are handled in the -following decreasing priority order: MEI, MSI, MTI.

-
-
-

As S-mode is not implemented, the corresponding bits in sip and sie are read-only zero.

-
-
-
-

3.1.10. Hardware Performance Monitor

-
-

M-mode includes a basic hardware performance-monitoring facility. The -mcycle CSR counts the number of clock cycles executed by the processor -core on which the hart is running. The minstret CSR counts the number -of instructions the hart has retired. The mcycle and minstret -registers have 64-bit precision on all RV32 and RV64 harts.

-
-
-

The counter registers have an arbitrary value after the hart is reset, -and can be written with a given value. Any CSR write takes effect after -the writing instruction has otherwise completed. The mcycle CSR may be -shared between harts on the same core, in which case writes to mcycle -will be visible to those harts. The platform should provide a mechanism -to indicate which harts share an mcycle CSR.

-
-
-

[CV32A65X] The hardware performance monitor includes 29 additional 64-bit event -counters, mhpmcounter3-mhpmcounter31. The event selector CSRs, -mhpmevent3-mhpmevent31, are 64-bit WARL registers that control which -event causes the corresponding counter to increment. The meaning of -these events is defined by the platform, but event 0 is defined to mean -"no event." In CV32A65X all counters are implemented, but both the counter and its corresponding event -selector are read-only 0.

-
-
-
-Diagram -
-
Figure 14. Hardware performance monitor counters.
-
-
-

The mhpmcounters are WARL registers that support up to 64 bits of -precision on RV32 and RV64.

-
-
-

When XLEN=32, reads of the mcycle, minstret, mhpmcountern, and mhpmeventn -CSRs return bits 31-0 of the corresponding register, and writes change -only bits 31-0; reads of the mcycleh, minstreth, and mhpmcounternh -CSRs return bits 63-32 of the corresponding register, and writes change -only bits 63-32.

-
-
-

As the Sscofpmf extension is not implemented, the mhpmeventnh CSRs -are not provided.

-
-
-
-

3.1.11. Machine Counter-Enable (mcounteren) Register

-
-

The counter-enable mcounteren register is a 32-bit register that -controls the availability of the hardware performance-monitoring -counters to the next-lower privileged mode.

-
-
-

[CV32A65X] As U-mode is not implemented, the mcounteren register does not exist.

-
-
-
-

3.1.12. Machine Counter-Inhibit (mcountinhibit) Register

-
-
-Diagram -
-
Figure 15. Counter-inhibit mcountinhibit register
-
-
-

[CV32A65X] The mcountinhibit register is not implemented; the implementation behaves as though the register were set to zero.

-
-
-
-

3.1.13. Machine Scratch (mscratch) Register

-
-

The mscratch register is an MXLEN-bit read/write register dedicated -for use by machine mode. Typically, it is used to hold a pointer to a -machine-mode hart-local context space and swapped with a user register -upon entry to an M-mode trap handler.

-
-
-
-Diagram -
-
Figure 16. Machine-mode scratch register.
-
-
-
-

3.1.14. Machine Exception Program Counter (mepc) Register

-
-

mepc is an MXLEN-bit read/write register formatted as shown in -Figure 17. The low bit of mepc (mepc[0]) is -always zero.

-
-
-

mepc is a WARL register that must be able to hold all valid virtual -addresses. It need not be capable of holding all possible invalid -addresses. Prior to writing mepc, implementations may convert an -invalid address into some other invalid address that mepc is capable -of holding.

-
-
-

When a trap is taken into M-mode, mepc is written with the virtual -address of the instruction that was interrupted or that encountered the -exception. Otherwise, mepc is never written by the implementation, -though it may be explicitly written by software.

-
-
-
-Diagram -
-
Figure 17. Machine exception program counter register.
-
-
-
-

3.1.15. Machine Cause (mcause) Register

-
-

The mcause register is an MXLEN-bit read-write register formatted as -shown in Figure 18. When a trap is taken into -M-mode, mcause is written with a code indicating the event that -caused the trap. Otherwise, mcause is never written by the -implementation, though it may be explicitly written by software.

-
-
-

The Interrupt bit in the mcause register is set if the trap was caused -by an interrupt. The Exception Code field contains a code identifying -the last exception or interrupt. Table 12 lists -the possible machine-level exception codes. The Exception Code is a -WLRL field, so is only guaranteed to hold supported exception codes.

-
-
-
-Diagram -
-
Figure 18. Machine Cause (mcause) register.
-
-
-

[CV32A65X] Note that load and load-reserved instructions generate load exceptions, -whereas store and store-conditional instructions generate -store exceptions.

-
-
-

[CVA6] If an instruction may raise multiple synchronous exceptions, the -decreasing priority order of -Table 13 indicates which -exception is taken and reported in mcause. The priority of any custom -synchronous exceptions is implementation-defined. TODO

-
-
- - ----- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
Table 12. Machine cause (mcause) register values after trap.
InterruptException CodeDescription

1
-1
-1
-1

0
-1
-2
-3

Reserved
-Supervisor software interrupt
-Reserved
-Machine software interrupt

1
-1
-1
-1

4
-5
-6
-7

Reserved
-Supervisor timer interrupt
-Reserved
-Machine timer interrupt

1
-1
-1
-1

8
-9
-10
-11

Reserved
-Supervisor external interrupt
-Reserved
-Machine external interrupt

1
-1
-1
-1

12
-13
-14-15
-≥16

Reserved
-Counter-overflow interrupt
-Reserved
-Designated for platform use

0
-0
-0
-0
-0
-0
-0
-0
-0
-0
-0
-0
-0
-0
-0
-0
-0
-0
-0
-0
-0
-0

0
-1
-2
-3
-4
-5
-6
-7
-8
-9
-10
-11
-12
-13
-14
-15
-16
-17
-18
-19
-20-23
-24-31
-32-47
-48-63
-≥64

Instruction address misaligned
-Instruction access fault
-Illegal instruction
-Breakpoint
-Load address misaligned
-Load access fault
-Store/AMO address misaligned
-Store/AMO access fault
-Environment call from U-mode
-Environment call from S-mode
-Reserved
-Environment call from M-mode
-Instruction page fault
-Load page fault
-Reserved
-Store/AMO page fault
-Double trap
-Reserved
-Software check
-Hardware error
-Reserved
-Designated for custom use
-Reserved
-Designated for custom use
-Reserved

-
- - ----- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
Table 13. Synchronous exception priority in decreasing priority order.
PriorityExc.CodeDescription

Highest

3

Instruction address breakpoint

12, 1

During instruction address translation:
-First encountered page fault or access fault

1

With physical address for instruction:
-Instruction access fault

2
-0
-8,9,11
-3
-3

Illegal instruction
-Instruction address misaligned
-Environment call
-Environment break
-Load/store/AMO address breakpoint

4,6

Optionally:
-Load/store/AMO address misaligned

13, 15, 5, 7

During address translation for an explicit memory access:
-First encountered page fault or access fault

5,7

With physical address for an explicit memory access:
-Load/store/AMO access fault

Lowest

4,6

If not higher priority:
-Load/store/AMO address misaligned

-
-

[CV32A65X] Load/store address-misaligned exceptions may have either higher or -lower priority than load/store access-fault -exceptions. TODO

-
-
-
-

3.1.16. Machine Trap Value (mtval) Register

-
-

[CV32A65X] The mtval register is an MXLEN-bit read-write register -holding constant value zero.

-
-
-
-

3.1.17. Machine Configuration Pointer (mconfigptr) Register

-
-

The mconfigptr register is an MXLEN-bit read-only CSR that holds the physical -address of a configuration data structure.

-
-
-

[CV32A65X] The mconfigptr register is implemented, but it is read-only 0 to indicate the -configuration data structure does not exist.

-
-
-
-

3.1.18. Machine Environment Configuration (menvcfg) Register

-
-

The menvcfg CSR is a 64-bit read/write register that controls -certain characteristics of the execution environment for modes less -privileged than M.

-
-
-

[CV32A65X] As XLEN=32, menvcfgh is a 32-bit read/write register -that aliases bits 63:32 of menvcfg.

-
-
-

[CV32A65X] As U-mode is not supported, then registers menvcfg and menvcfgh do -not exist.

-
-
-
-

3.1.19. Machine Security Configuration (mseccfg) Register

-
-

mseccfg is an optional 64-bit read/write register, -that controls security features.

-
-
-

As XLEN=32, mseccfgh is a 32-bit read/write register -that aliases bits 63:32 of mseccfg.

-
-
-

[CV32A65X] As Zkr, Smepmp, and Smmpm extensions are not implemented, -mseccfg and mseccfgh do not exist. TODO.

-
-
-
-
-

3.2. Machine-Level Memory-Mapped Registers

-
-

3.2.1. Machine Timer (mtime and mtimecmp) Registers

-
-

Platforms provide a real-time counter, exposed as a memory-mapped -machine-mode read-write register, mtime. mtime must increment at -constant frequency, and the platform must provide a mechanism for -determining the period of an mtime tick. The mtime register will -wrap around if the count overflows.

-
-
-

The mtime register has a 64-bit precision on all RV32 and RV64 -systems. Platforms provide a 64-bit memory-mapped machine-mode timer -compare register (mtimecmp). A machine timer interrupt becomes pending -whenever mtime contains a value greater than or equal to mtimecmp, -treating the values as unsigned integers. The interrupt remains posted -until mtimecmp becomes greater than mtime (typically as a result of -writing mtimecmp). The interrupt will only be taken if interrupts are -enabled and the MTIE bit is set in the mie register.

-
-
-
-Diagram -
-
Figure 19. Machine time register (memory-mapped control register).
-
-
-
-Diagram -
-
Figure 20. Machine time compare register (memory-mapped control register).
-
-
-

If the result of the comparison between mtime and mtimecmp changes, it is -guaranteed to be reflected in MTIP eventually, but not necessarily -immediately.

-
-
-

In RV32, memory-mapped writes to mtimecmp modify only one 32-bit part -of the register. The following code sequence sets a 64-bit mtimecmp -value without spuriously generating a timer interrupt due to the -intermediate value of the comparand:

-
-
-
Sample code for setting the 64-bit time comparand in RV32 assuming a little-endian memory system and that the registers live in a strongly ordered I/O region. Storing -1 to the low-order bits of mtimecmp prevents mtimecmp from temporarily becoming smaller than the lesser of the old and new values.
-
-
            # New comparand is in a1:a0.
-            li t0, -1
-            la t1, mtimecmp
-            sw t0, 0(t1)     # No smaller than old value.
-            sw a1, 4(t1)     # No smaller than new value.
-            sw a0, 0(t1)     # New value.
-
-
-
-
-
-

3.3. Machine-Mode Privileged Instructions

-
-

3.3.1. Environment Call and Breakpoint

-
-
-Diagram -
-
-
-

[CV32A65X] The ECALL instruction is used to make a request to the supporting -execution environment. When executed in M-mode, it -generates an environment-call-from-M-mode -exception, and performs no other operation.

-
-
-

The EBREAK instruction is used by debuggers to cause control to be -transferred back to a debugging environment. -Unless overridden by an external debug environment, EBREAK raises -a breakpoint exception and performs no other operation.

-
-
-

ECALL and EBREAK cause the receiving privilege mode’s epc register to -be set to the address of the ECALL or EBREAK instruction itself, not -the address of the following instruction. As ECALL and EBREAK cause -synchronous exceptions, they are not considered to retire, and should -not increment the minstret CSR.

-
-
-
-

3.3.2. Trap-Return Instructions

-
-

Instructions to return from trap are encoded under the PRIV minor -opcode.

-
-
-
-Diagram -
-
-
-

[CV32A65X] To return after handling a trap, there are separate trap return -instructions per privilege level, MRET and SRET. MRET is always -provided. In CV32A65X, SRET is not provided as supervisor mode is not supported, and -raises an illegal-instruction exception. In addition to manipulating -the privilege stack as described in Section 3.1.6.1, -MRET sets the pc to the value stored in the mepc register.

-
-
-
-

3.3.3. Wait for Interrupt

-
-

[CV32A65X] The Wait for Interrupt instruction (WFI) informs the -implementation that the current hart can be stalled until an interrupt -might need servicing. This instruction -cannot raise an illegal-instruction exception because TW=0 in mstatus, as -described in Section 3.1.6.6.

-
-
-
-Diagram -
-
-
-

If an enabled interrupt is present or later becomes present while the -hart is stalled, the interrupt trap will be taken on the following -instruction, i.e., execution resumes in the trap handler and mepc = -pc + 4.

-
-
-

Implementations are permitted to resume execution for any reason, even if an -enabled interrupt has not become pending. Hence, a legal implementation is to -simply implement the WFI instruction as a NOP.

-
-
-

[CV32A65X] The WFI instruction can also be executed when interrupts are disabled. -The operation of WFI must be unaffected by the global interrupt bits in -mstatus (MIE), but should -honor the individual interrupt enables (e.g., MTIE) (i.e., -implementations should avoid resuming the hart if the interrupt is -pending but not individually enabled). WFI is also required to resume -execution for locally enabled interrupts pending, -regardless of the global interrupt enable.

-
-
-

If the event that causes the hart to resume execution does not cause an -interrupt to be taken, execution will resume at pc + 4, and software -must determine what action to take, including looping back to repeat the -WFI if there was no actionable event.

-
-
-
-

3.3.4. Custom SYSTEM Instructions

-
-

The subspace of the SYSTEM major opcode shown in Figure 21 is designated for custom use. It is recommended that these instructions use bits 29:28 to designate the -minimum required privilege mode, as do other SYSTEM instructions.

-
-
-
-Diagram -
-
Figure 21. SYSTEM instruction encodings designated for custom use.
-
-
-
-
-

3.4. Reset

-
-

[CV32A65X] Privilege mode is always M. -As little-endian memory accesses are supported, -the mstatus/mstatush field MBE is reset to 0. -Upon reset, the mstatus fields MIE and MPRV are reset to 0. -The misa register is set as described in Section 3.1.1. -The pc is set to 0x80000000 reset vector. TODO -The mcause register is set to a value indicating the cause of the reset. -Writable PMP registers’ A and L fields are set to 0. -No WARL field contains an illegal value. All other hart state is UNSPECIFIED.

-
-
-

As CV32A65X does not distinguish between different reset conditions, -mcause returns 0 after reset.

-
-
-
-

3.5. Non-Maskable Interrupts

-
-

Non-maskable interrupts (NMIs) are only used for hardware error -conditions, and cause an immediate jump to an implementation-defined NMI -vector running in M-mode regardless of the state of a hart’s interrupt -enable bits. The mepc register is written with the virtual address of -the instruction that was interrupted, and mcause is set to a value -indicating the source of the NMI. The NMI can thus overwrite state in an -active machine-mode interrupt handler.

-
-
-

[CV32A65X] Upon NMI, the high Interrupt bit of mcause is set to indicate -that this was an interrupt. As CV32A65X does not distinguish sources -of NMIs, the mcause register returns 0 in the Exception Code.

-
-
-

Unlike resets, NMIs do not reset processor state, enabling diagnosis, -reporting, and possible containment of the hardware error.

-
-
-
-

3.6. Physical Memory Attributes

-
-

The physical memory map for a complete system includes various address -ranges, some corresponding to memory regions and some to memory-mapped -control registers, portions of which might not be accessible. Some -memory regions might not support reads, writes, or execution; some might -not support subword or subblock accesses; some might not support atomic -operations; and some might not support cache coherence or might have -different memory models. Similarly, memory-mapped control registers vary -in their supported access widths, support for atomic operations, and -whether read and write accesses have associated side effects. In RISC-V -systems, these properties and capabilities of each region of the -machine’s physical address space are termed physical memory attributes -(PMAs). This section describes RISC-V PMA terminology and how RISC-V -systems implement and check PMAs.

-
-
-

[CV32A65X] PMAs are inherent properties of the underlying hardware. The PMAs of -some memory regions are fixed at chip design time.

-
-
-

[CV32A65X] Some PMAs are dynamically -checked in hardware later in the execution pipeline after the physical -address is known, as some operations will not be supported at all -physical memory addresses, and some operations require knowing the -setting of a PMA attribute.

-
-
-

[CV32A65X] For RISC-V, we separate out specification and checking of PMAs into a -separate hardware structure, the PMA checker. In CV32A65X, the -attributes are known at system design time for each physical address -region, and are hardwired into the PMA checker. -PMAs are checked for any access to physical memory, including accesses -that have undergone virtual to physical memory translation. To aid in -system debugging, we strongly recommend that, where possible, RISC-V -processors precisely trap physical memory accesses that fail PMA checks. -Precisely trapped PMA violations manifest as instruction, load, or store -access-fault exceptions, distinct from virtual-memory page-fault -exceptions. Precise PMA traps might not always be possible, for example, -when probing a legacy bus architecture that uses access failures as part -of the discovery mechanism. In this case, error responses from -peripheral devices will be reported as imprecise bus-error interrupts.

-
-
-

[CV32A65X] PMAs are not readable by software.

-
-
-

3.6.1. Main Memory versus I/O Regions

-
-

The most important characterization of a given memory address range is -whether it holds regular main memory or I/O devices. -Regular main memory is required to have a number of properties, -specified below, whereas I/O devices can have a much broader range of -attributes. Memory regions that do not fit into regular main memory, for -example, device scratchpad RAMs, are categorized as I/O regions.

-
-
- - - - - -
- - -What previous versions of this specification termed vacant regions are -no longer a distinct category; they are now described as I/O regions that are -not accessible (i.e. lacking read, write, and execute permissions). -Main memory regions that are not accessible are also allowed. -
-
-
-
-

3.6.2. Supported Access Type PMAs

-
-

Access types specify which access widths, from 8-bit byte to long -multi-word burst, are supported, and also whether misaligned accesses -are supported for each access width.

-
-
-

Main memory regions always support read and write of all access widths -required by the attached devices, and can specify whether instruction -fetch is supported.

-
-
-

I/O regions can specify which combinations of read, write, or execute -accesses to which data widths are supported.

-
-
-

For systems with page-based virtual memory, I/O and memory regions can -specify which combinations of hardware page-table reads and hardware -page-table writes are supported.

-
-
-
-

3.6.3. Atomicity PMAs

-
-

[CV32A65X] Atomic extension is not implemented.

-
-
-
3.6.3.1. AMO PMA
-
-

[CV32A65X] Atomic extension is not implemented.

-
-
-
-
3.6.3.2. Reservability PMA
-
-

[CV32A65X] Atomic extension is not implemented.

-
-
-
-
-

3.6.4. Misaligned Atomicity Granule PMA

-
-

[CV32A65X] Atomic extension is not implemented.

-
-
-
-

3.6.5. Memory-Ordering PMAs

-
-

[CV32A65X] As CV32A65X is dedicated to a single-hart -platform without any DMA, no memory-ordering mechanism is implemented.

-
-
-
-

3.6.6. Coherence and Cacheability PMAs

-
-

[CV32A65X] Write accesses are not cached. No cache-coherence scheme -is implemented.

-
-
-

If a PMA indicates non-cacheability, then accesses to that region must -be satisfied by the memory itself, not by any caches.

-
-
-
-

3.6.7. Idempotency PMAs

-
-

Idempotency PMAs describe whether reads and writes to an address region -are idempotent. Main memory regions are assumed to be idempotent. For -I/O regions, idempotency on reads and writes can be specified separately -(e.g., reads are idempotent but writes are not). If accesses are -non-idempotent, i.e., there is potentially a side effect on any read or -write access, then speculative or redundant accesses must be avoided.

-
-
-

For the purposes of defining the idempotency PMAs, changes in observed -memory ordering created by redundant accesses are not considered a side -effect.

-
-
-

For non-idempotent regions, implicit reads and writes must not be -performed early or speculatively, with the following exceptions. When a -non-speculative implicit read is performed, an implementation is -permitted to additionally read any of the bytes within a naturally -aligned power-of-2 region containing the address of the non-speculative -implicit read. Furthermore, when a non-speculative instruction fetch is -performed, an implementation is permitted to additionally read any of -the bytes within the next naturally aligned power-of-2 region of the -same size (with the address of the region taken modulo -$2^{\text{XLEN}}$). The results of these additional reads -may be used to satisfy subsequent early or speculative implicit reads. -The size of these naturally aligned power-of-2 regions is -implementation-defined, but, for systems with page-based virtual memory, -must not exceed the smallest supported page size.

-
-
-
-
-

3.7. Physical Memory Protection

-
-

To support secure processing and contain faults, it is desirable to -limit the physical addresses accessible by software running on a hart. -An optional physical memory protection (PMP) unit provides per-hart -machine-mode control registers to allow physical memory access -privileges (read, write, execute) to be specified for each physical -memory region. The PMP values are checked in parallel with the PMA -checks described in Section 3.6.

-
-
-

[CV32A65X] The granularity of PMP access control settings is as small as four bytes.

-
-
-

[CV32A65X] PMP checks apply to M-mode accesses, in which case the PMP registers -themselves are locked, so that even M-mode software cannot change them -until the hart is reset. In effect, PMP can revoke permissions from -M-mode, which by default has full permissions.

-
-
-

PMP violations are always trapped precisely at the processor.

-
-
-

3.7.1. Physical Memory Protection CSRs

-
-

PMP entries are described by an 8-bit configuration register and one -MXLEN-bit address register. Some PMP settings additionally use the -address register associated with the preceding PMP entry. 64 PMP -entries are implemented. The lowest-numbered PMP entries must be -implemented first. All PMP CSR fields are WARL and 56 upper entries are -read-only zero. PMP CSRs are only accessible to M-mode.

-
-
-

[CV32A65X] The PMP configuration registers are densely packed into CSRs to minimize -context-switch time. For CV32A65X with sixty-four PMP entries, sixteen CSRs, pmpcfg0–pmpcfg15, hold -the configurations as shown -in Figure 22. -The 14 upper CSRs (pmpcfg2–pmpcfg15) are read-only zero.

-
-
-
-Diagram -
-
Figure 22. RV32 PMP configuration CSR layout.
-
-
-

[CV32A65X] The PMP address registers are CSRs named pmpaddr0-pmpaddr63. Each -PMP address register encodes bits 33-2 of a 34-bit physical address for -RV32, as shown in Figure 23. Not all -physical address bits may be implemented, and so the pmpaddr registers -are WARL.

-
-
-
-Diagram -
-
Figure 23. PMP address register format, RV32.
-
-
-

Figure 24 shows the layout of a PMP configuration -register. The R, W, and X bits, when set, indicate that the PMP entry -permits read, write, and instruction execution, respectively. When one -of these bits is clear, the corresponding access type is denied. The R, -W, and X fields form a collective WARL field for which the combinations with R=0 and W=1 are reserved. The remaining two fields, A and L, are described in the following sections.

-
-
-
-Diagram -
-
Figure 24. PMP configuration register format.
-
-
-

Attempting to fetch an instruction from a PMP region that does not have -execute permissions raises an instruction access-fault exception. -Attempting to execute a load or load-reserved instruction which accesses -a physical address within a PMP region without read permissions raises a -load access-fault exception. Attempting to execute a store, -store-conditional, or AMO instruction which accesses a physical address -within a PMP region without write permissions raises a store -access-fault exception.

-
-
-
3.7.1.1. Address Matching
-
-

The A field in a PMP entry’s configuration register encodes the -address-matching mode of the associated PMP address register. The -encoding of this field is shown in Table 14.

-
-
-

When A=0, this PMP entry is disabled and matches no addresses. Two other -address-matching modes are supported: naturally aligned power-of-2 -regions (NAPOT), including the special case of naturally aligned -four-byte regions (NA4); and the top boundary of an arbitrary range -(TOR). These modes support four-byte granularity.

-
-
-

[CV32A65X] Two address-matching modes are supported: disabled and TOR.

-
- - ----- - - - - - - - - - - - - - - -
Table 14. Encoding of A field in PMP configuration registers.
ANameDescription

0
-1
-2
-3

OFF
-TOR
-NA4
-NAPOT

Null region (disabled)
-Top of range
-Not supported
-Not supported

-
-

If TOR is selected, the associated address register forms the top of the -address range, and the preceding PMP address register forms the bottom -of the address range. If PMP entry $i$'s A field is set to -TOR, the entry matches any address $y$ such that $\texttt{pmpaddr}_{i-1} \le y < \texttt{pmpaddr}_i$ (irrespective of the value of $\texttt{pmpcfg}_{i-1}$). If PMP entry 0’s A field is set to TOR, zero is used for the lower bound, and so it matches -any address $y < \texttt{pmpaddr}_0$.

-
-
-

[CV32A65X] Although the PMP mechanism supports regions as small as four bytes, -platforms may specify coarser PMP regions. In general, the PMP grain is -$2^{G+2}$ bytes and must be the same across all PMP regions. -When $G \ge 1$ and -$\texttt{pmpcfg}_i$.A[1] is clear, i.e. the mode is OFF or TOR, -then bits $\texttt{pmpaddr}_i$[G-1:0] read as all zeros. Bits -$\texttt{pmpaddr}_i$[G-1:0] do not affect the TOR address-matching -logic.

-
-
-

If the current XLEN is greater than MXLEN, the PMP address registers are -zero-extended from MXLEN to XLEN bits for the purposes of address -matching.

-
-
-
-
3.7.1.2. Locking and Privilege Mode
-
-

The L bit indicates that the PMP entry is locked, i.e., writes to the -configuration register and associated address registers are ignored. -Locked PMP entries remain locked until the hart is reset. If PMP entry -$i$ is locked, writes to pmp$i$cfg and $\texttt{pmpaddr}_i$ are ignored. Additionally, if PMP -entry $i$ is locked and pmp$i$cfg.A is set -to TOR, writes to $\texttt{pmpaddr}_{i-1}$ are ignored.

-
-
-

[CV32A65X] In addition to locking the PMP entry, the L bit indicates whether the -R/W/X permissions are enforced on M-mode accesses. When the L -bit is clear, any M-mode access matching the PMP entry will succeed.

-
-
-
-
3.7.1.3. Priority and Matching Logic
-
-

PMP entries are statically prioritized. The lowest-numbered PMP entry -that matches any byte of an access determines whether that access -succeeds or fails. The matching PMP entry must match all bytes of an -access, or the access fails, irrespective of the L, R, W, and X bits. -For example, if a PMP entry is configured to match the four-byte range -0xC–0xF, then an 8-byte access to the range 0x8–0xF will fail, -assuming that PMP entry is the highest-priority entry that matches those -addresses.

-
-
-

If a PMP entry matches all bytes of an access, then the L, R, W, and X -bits determine whether the access succeeds or fails. If the L bit is -clear and the privilege mode of the access is M, the access succeeds.

-
-
-

[CV32A65X] If no PMP entry matches an M-mode access, the access succeeds.

-
-
-

On some implementations, misaligned loads, stores, and instruction -fetches may also be decomposed into multiple accesses, some of which may -succeed before an access-fault exception occurs. In particular, a -portion of a misaligned store that passes the PMP check may become -visible, even if another portion fails the PMP check. The same behavior -may manifest for stores wider than XLEN bits (e.g., the FSD instruction -in RV32D), even when the store address is naturally aligned.

-
-
-
-
-

3.7.2. Physical Memory Protection and Paging

-
-

[CV32A65X] As page-based virtual memory is not implemented, memory accesses -check the PMP settings synchronously.

-
-
-
-
-
-
-

4. "Smstateen/Ssstateen" Extensions, Version 1.0

-
-
-

CV32A65X: This extension is not supported.

-
-
-
-
-

5. "Smcsrind/Sscsrind" Indirect CSR Access, Version 1.0

-
-
-

CV32A65X: This extension is not supported.

-
-
-
-
-

6. "Smepmp" Extension for PMP Enhancements for memory access and execution prevention in Machine mode, Version 1.0

-
-
-

CV32A65X: This extension is not supported.

-
-
-
-
-

7. "Smcntrpmf" Cycle and Instret Privilege Mode Filtering, Version 1.0

-
-
-

CV32A65X: This extension is not supported.

-
-
-
-
-

8. "Smrnmi" Extension for Resumable Non-Maskable Interrupts, Version 0.5

-
-
-

CV32A65X: This extension is not supported.

-
-
-
-
-

9. "Smcdeleg" Counter Delegation Extension, Version 1.0

-
-
-

CV32A65X: This extension is not supported.

-
-
-
-
-

10. "Smdbltrp" Double Trap Extension, Version 1.0

-
-
-

CV32A65X: This extension is not supported.

-
-
-
-
-

11. Supervisor-Level ISA, Version 1.13

-
-
-

CV32A65X: This extension is not supported.

-
-
-
-
-

12. "Sstc" Extension for Supervisor-mode Timer Interrupts, Version 1.0

-
-
-

CV32A65X: This extension is not supported.

-
-
-
-
-

13. "Sscofpmf" Extension for Count Overflow and Mode-Based Filtering, Version 1.0

-
-
-

CV32A65X: This extension is not supported.

-
-
-
-
-

14. "H" Extension for Hypervisor Support, Version 1.0

-
-
-

CV32A65X: This extension is not supported.

-
-
-
-
-

15. Control-flow Integrity (CFI)

-
-
-

CV32A65X: The Zicfiss extension is not supported.

-
-
-

CV32A65X: The Zicfilp extension is not supported.

-
-
-
-
-

16. "Ssdbltrp" Double Trap Extension, Version 1.0

-
-
-

CV32A65X: This extension is not supported.

-
-
-
-
-

17. RISC-V Privileged Instruction Set Listings

-
-
-

This chapter presents instruction-set listings for all instructions -defined in the RISC-V Privileged Architecture.

-
-
-

The instruction-set listings for unprivileged instructions, including -the ECALL and EBREAK instructions, are provided in Volume I of this -manual.

-
-
-
-Diagram -
-
Figure 25. RISC-V Privileged Instructions
-
-
-
-
-

18. History

-
-
-

18.1. Research Funding at UC Berkeley

-
-

Development of the RISC-V architecture and implementations has been -partially funded by the following sponsors.

-
-
-
    -
  • -

    Par Lab: Research supported by Microsoft (Award #024263) and Intel -(Award #024894) funding and by matching funding by U.C. Discovery (Award -#DIG07-10227). Additional support came from Par Lab affiliates Nokia, -NVIDIA, Oracle, and Samsung.

    -
  • -
  • -

    Project Isis: DoE Award DE-SC0003624.

    -
  • -
  • -

    ASPIRE Lab: DARPA PERFECT program, Award HR0011-12-2-0016. DARPA -POEM program Award HR0011-11-C-0100. The Center for Future Architectures -Research (C-FAR), a STARnet center funded by the Semiconductor Research -Corporation. Additional support from ASPIRE industrial sponsor, Intel, -and ASPIRE affiliates, Google, Huawei, Nokia, NVIDIA, Oracle, and -Samsung.

    -
  • -
-
-
-

The content of this paper does not necessarily reflect the position or -the policy of the US government and no official endorsement should be -inferred.

-
-
-
-
-
-

Bibliography

-
-
-

Goldberg, R. P. (1974). Survey of virtual machine research. Computer, 7(6), 34–45.

-
-
-
-
- - - \ No newline at end of file diff --git a/docs/04_cv32a65x/riscv/src/config.adoc b/docs/04_cv32a65x/riscv/src/config.adoc deleted file mode 100644 index c41149b11e..0000000000 --- a/docs/04_cv32a65x/riscv/src/config.adoc +++ /dev/null @@ -1,49 +0,0 @@ -:ohg-config: CV32A65X - -:XLEN: 32 -:RVA: false -:RVC: true -:RVD: false -:RVE: false -:RVF: false -:RVH: false -:RVQ: false -:RVS: false -:RVU: false -:RVV: false -:RVZabha: false -:RVZacas: false -:RVZawrs: false -:RVZfa: false -:RVZfbf-RZvfbf: false -:RVZfh: false -:RVZfinx: false -:RVZicbo: false -:RVZicfilp: false -:RVZicfiss: false -:RVZicond: false -:RVZifencei: false -:RVZihintntl: false -:RVZihintpause: false -:RVZihpm: false -:RVZimop: false -:RVZk: false -:RVZsmcdeleg: false -:RVZsmcntrpmf: false -:RVZsmcsrind-RVZsscsrind: false -:RVZsmdbltrp: false -:RVZsmepmp: false -:RVZsmmpm: false -:RVZsmrnmi: false -:RVZsmstateen: false -:RVZsscofpmf: false -:RVZssdbltrp: false -:RVZsstc: false -:RVZtso: false -:RVZvk: false -:SV: SV0 - -:DCacheEn: false -:MTvalEn: false -:MTvecDirectEn: true -:note: false diff --git a/docs/04_cv32a65x/riscv/unpriv-isa-cv32a65x.html b/docs/04_cv32a65x/riscv/unpriv-isa-cv32a65x.html deleted file mode 100644 index 811ff5e2c8..0000000000 --- a/docs/04_cv32a65x/riscv/unpriv-isa-cv32a65x.html +++ /dev/null @@ -1,26352 +0,0 @@ - - - - - - - - -The RISC-V Instruction Set Manual for CV32A65X: Volume I - Unprivileged Architecture - - - - - - -
-
-
-
-

This document describes the RISC-V unprivileged architecture tailored for -OpenHW Group CV32A65X. -Irrelevant parts (e.g. unsupported extensions) of the original -specification are replaced by placeholders.

-
-
-

Contributors to all versions of the spec in alphabetical order (please contact editors to suggest -corrections): Derek Atkins, -Arvind, -Krste Asanović, -Rimas Avižienis, -Jacob Bachmeyer, -Christopher F. Batten, -Allen J. Baum, -Abel Bernabeu, -Alex Bradbury, -Scott Beamer, -Hans Boehm, -Preston Briggs, -Christopher Celio, -Chuanhua Chang, -David Chisnall, -Paul Clayton, -Palmer Dabbelt, -L Peter Deutsch, -Ken Dockser, -Paul Donahue, -Aaron Durbin, -Roger Espasa, -Greg Favor, -Andy Glew, -Shaked Flur, -Stefan Freudenberger, -Marc Gauthier, -Andy Glew, -Jan Gray, -Gianluca Guida, -Michael Hamburg, -John Hauser, -John Ingalls, -David Horner, -Bruce Hoult, -Bill Huffman, -Alexandre Joannou, -Olof Johansson, -Ben Keller, -David Kruckemyer, -Tariq Kurd, -Yunsup Lee, -Paul Loewenstein, -Daniel Lustig, -Yatin Manerkar, -Luc Maranget, -Ben Marshall, -Margaret Martonosi, -Phil McCoy, -Nathan Menhorn, -Christoph Müllner, -Joseph Myers, -Vijayanand Nagarajan, -Rishiyur Nikhil, -Jonas Oberhauser, -Stefan O’Rear, -Markku-Juhani O. Saarinen, -Albert Ou, -John Ousterhout, -Daniel Page, -David Patterson, -Christopher Pulte, -Jose Renau, -Josh Scheid, -Colin Schmidt, -Peter Sewell, -Susmit Sarkar, -Ved Shanbhogue, -Brent Spinney, -Brendan Sweeney, -Michael Taylor, -Wesley Terpstra, -Matt Thomas, -Tommy Thorn, -Philipp Tomsich, -Caroline Trippel, -Ray VanDeWalker, -Muralidaran Vijayaraghavan, -Megan Wachs, -Paul Wamsley, -Andrew Waterman, -Robert Watson, -David Weaver, -Derek Williams, -Claire Wolf, -Andrew Wright, -Reinoud Zandijk, -and Sizhuo Zhang.

-
-
-

This document is released under a Creative Commons Attribution 4.0 International License.

-
-
-

This document is a derivative of “The RISC-V Instruction Set Manual, Volume I: User-Level ISA -Version 2.1” released under the following license: ©2010-2017 Andrew Waterman, Yunsup Lee, -David Patterson, Krste Asanović. Creative Commons Attribution 4.0 International License. -Please cite as: “The RISC-V Instruction Set Manual, Volume I: User-Level ISA, Document -Version 20191214-draft”, Editors Andrew Waterman and Krste Asanović, RISC-V Foundation, -December 2019.

-
-
-

Contributors to CV32A65X versions of the spec in alphabetical order: -Jean-Roch Coulon, André Sintzoff.

-
-
-
-
-

Preface

-
-
-

Preface to Document Version for CV32A65X

-
-
-

This document describes the RISC-V unprivileged architecture tailored for -OpenHW Group CV32A65X.

-
-
-

Preface to Document Version 20240703

-
-
-

This document describes the RISC-V unprivileged architecture.

-
-
-

The ISA modules marked Ratified have been ratified at this time. The -modules marked Frozen are not expected to change significantly before -being put up for ratification. The modules marked Draft are expected -to change before ratification.

-
-
-

The document contains the following versions of the RISC-V ISA modules:

-
- ----- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
BaseVersionStatus

RV32I

2.1

Ratified

RV32E

2.0

Ratified

RV64E

2.0

Ratified

RV64I

2.1

Ratified

RV128I

1.7

Draft

Extension

Version

Status

Zifencei

2.0

Ratified

Zicsr

2.0

Ratified

Zicntr

2.0

Ratified

Zihintntl

1.0

Ratified

Zihintpause

2.0

Ratified

Zimop

1.0

Ratified

Zicond

1.0

Ratified

M

2.0

Ratified

Zmmul

1.0

Ratified

A

2.1

Ratified

Zawrs

1.01

Ratified

Zacas

1.0

Ratified

Zabha

1.0

Ratified

RVWMO

2.0

Ratified

Ztso

1.0

Ratified

CMO

1.0

Ratified

F

2.2

Ratified

D

2.2

Ratified

Q

2.2

Ratified

Zfh

1.0

Ratified

Zfhmin

1.0

Ratified

Zfa

1.0

Ratified

Zfinx

1.0

Ratified

Zdinx

1.0

Ratified

Zhinx

1.0

Ratified

Zhinxmin

1.0

Ratified

C

2.0

Ratified

Zce

1.0

Ratified

B

1.0

Ratified

P

0.2

Draft

V

1.0

Ratified

Zbkb

1.0

Ratified

Zbkc

1.0

Ratified

Zbkx

1.0

Ratified

Zk

1.0

Ratified

Zks

1.0

Ratified

Zvbb

1.0

Ratified

Zvbc

1.0

Ratified

Zvkg

1.0

Ratified

Zvkned

1.0

Ratified

Zvknhb

1.0

Ratified

Zvksed

1.0

Ratified

Zvksh

1.0

Ratified

Zvkt

1.0

Ratified

Zicfiss

1.0

Ratified

Zicfilp

1.0

Ratified

-
-

The changes in this version of the document include:

-
-
-
    -
  • -

    The inclusion of all ratified extensions through March 2024.

    -
  • -
  • -

    The draft Zam extension has been removed, in favor of the definition of a misaligned atomicity granule PMA.

    -
  • -
  • -

    The concept of vacant memory regions has been superseded by inaccessible memory or I/O regions.

    -
  • -
-
-
-

Preface to Document Version 20191213-Base-Ratified

-
-
-

This document describes the RISC-V unprivileged architecture.

-
-
-

The ISA modules marked Ratified have been ratified at this time. The -modules marked Frozen are not expected to change significantly before -being put up for ratification. The modules marked Draft are expected -to change before ratification.

-
-
-

The document contains the following versions of the RISC-V ISA modules:

-
- ----- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
BaseVersionStatus

RVWMO

2.0

Ratified

RV32I

2.1

Ratified

RV64I

2.1

Ratified

RV32E

1.9

Draft

RV128I

1.7

Draft

Extension

Version

Status

M

2.0

Ratified

A

2.1

Ratified

F

2.2

Ratified

D

2.2

Ratified

Q

2.2

Ratified

C

2.0

Ratified

Counters

2.0

Draft

L

0.0

Draft

B

0.0

Draft

J

0.0

Draft

T

0.0

Draft

P

0.2

Draft

V

0.7

Draft

Zicsr

2.0

Ratified

Zifencei

2.0

Ratified

Zam

0.1

Draft

Ztso

0.1

Frozen

-
-

The changes in this version of the document include:

-
-
-
    -
  • -

    The A extension, now version 2.1, was ratified by the board in -December 2019.

    -
  • -
  • -

    Defined big-endian ISA variant.

    -
  • -
  • -

    Moved N extension for user-mode interrupts into Volume II.

    -
  • -
  • -

    Defined PAUSE hint instruction.

    -
  • -
-
-
-

Preface to Document Version 20190608-Base-Ratified

-
-
-

This document describes the RISC-V unprivileged architecture.

-
-
-

The RVWMO memory model has been ratified at this time. The ISA modules -marked Ratified have been ratified at this time. The modules marked -Frozen are not expected to change significantly before being put up -for ratification. The modules marked Draft are expected to change -before ratification.

-
-
-

The document contains the following versions of the RISC-V ISA modules:

-
- ----- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
BaseVersionStatus

RVWMO

2.0

Ratified

RV32I

2.1

Ratified

RV64I

2.1

Ratified

RV32E

1.9

Draft

RV128I

1.7

Draft

Extension

Version

Status

Zifencei

2.0

Ratified

Zicsr

2.0

Ratified

M

2.0

Ratified

A

2.0

Frozen

F

2.2

Ratified

D

2.2

Ratified

Q

2.2

Ratified

C

2.0

Ratified

Ztso

0.1

Frozen

Counters

2.0

Draft

L

0.0

Draft

B

0.0

Draft

J

0.0

Draft

T

0.0

Draft

P

0.2

Draft

V

0.7

Draft

Zam

0.1

Draft

-
-

The changes in this version of the document include:

-
-
-
    -
  • -

    Moved description to Ratified for the ISA modules ratified by the -board in early 2019.

    -
  • -
  • -

    Removed the A extension from ratification.

    -
  • -
  • -

    Changed document version scheme to avoid confusion with versions of -the ISA modules.

    -
  • -
  • -

    Incremented the version numbers of the base integer ISA to 2.1, -reflecting the presence of the ratified RVWMO memory model and exclusion -of FENCE.I, counters, and CSR instructions that were in previous base -ISA.

    -
  • -
  • -

    Incremented the version numbers of the F and D extensions to 2.2, -reflecting that version 2.1 changed the canonical NaN, and version 2.2 -defined the NaN-boxing scheme and changed the definition of the FMIN and -FMAX instructions.

    -
  • -
  • -

    Changed name of document to refer to "unprivileged" instructions as -part of move to separate ISA specifications from platform profile -mandates.

    -
  • -
  • -

    Added clearer and more precise definitions of execution environments, -harts, traps, and memory accesses.

    -
  • -
  • -

    Defined instruction-set categories: standard, reserved, custom, -non-standard, and non-conforming.

    -
  • -
  • -

    Removed text implying operation under alternate endianness, as -alternate-endianness operation has not yet been defined for RISC-V.

    -
  • -
  • -

    Changed description of misaligned load and store behavior. The -specification now allows visible misaligned address traps in execution -environment interfaces, rather than just mandating invisible handling of -misaligned loads and stores in user mode. Also, now allows access-fault -exceptions to be reported for misaligned accesses (including atomics) -that should not be emulated.

    -
  • -
  • -

    Moved FENCE.I out of the mandatory base and into a separate extension, -with Zifencei ISA name. FENCE.I was removed from the Linux user ABI and -is problematic in implementations with large incoherent instruction and -data caches. However, it remains the only standard instruction-fetch -coherence mechanism.

    -
  • -
  • -

    Removed prohibitions on using RV32E with other extensions.

    -
  • -
  • -

    Removed platform-specific mandates that certain encodings produce -illegal-instruction exceptions in RV32E and RV64I chapters.

    -
  • -
  • -

    Counter/timer instructions are now not considered part of the -mandatory base ISA, and so CSR instructions were moved into separate -chapter and marked as version 2.0, with the unprivileged counters moved -into another separate chapter. The counters are not ready for -ratification as there are outstanding issues, including counter -inaccuracies.

    -
  • -
  • -

    A CSR-access ordering model has been added.

    -
  • -
  • -

    Explicitly defined the 16-bit half-precision floating-point format for -floating-point instructions in the 2-bit fmt field.

    -
  • -
  • -

    Defined the signed-zero behavior of FMIN.fmt and FMAX.fmt, and -changed their behavior on signaling-NaN inputs to conform to the -minimumNumber and maximumNumber operations in the proposed IEEE 754-201x -specification.

    -
  • -
  • -

    The memory consistency model, RVWMO, has been defined.

    -
  • -
  • -

    The "Zam" extension, which permits misaligned AMOs and specifies -their semantics, has been defined.

    -
  • -
  • -

    The "Ztso" extension, which enforces a stricter memory consistency -model than RVWMO, has been defined.

    -
  • -
  • -

    Improvements to the description and commentary.

    -
  • -
  • -

    Defined the term IALIGN as shorthand to describe the -instruction-address alignment constraint.

    -
  • -
  • -

    Removed text of P extension chapter as now superseded by active task -group documents.

    -
  • -
  • -

    Removed text of V extension chapter as now superseded by separate -vector extension draft document.

    -
  • -
-
-
-

Preface to Document Version 2.2

-
-
-

This is version 2.2 of the document describing the RISC-V user-level -architecture. The document contains the following versions of the RISC-V -ISA modules:

-
- ----- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
Base

Version

Draft Frozen?

RV32I

2.0

Y

RV32E

1.9

N

RV64I

2.0

Y

RV128I

1.7

N

Extension

Version

Frozen?

M

2.0

Y

A

2.0

Y

F

2.0

Y

D

2.0

Y

Q

2.0

Y

L

0.0

N

C

2.0

Y

B

0.0

N

J

0.0

N

T

0.0

N

P

0.1

N

V

0.7

N

N

1.1

N

-
-

To date, no parts of the standard have been officially ratified by the -RISC-V Foundation, but the components labeled "frozen" above are not -expected to change during the ratification process beyond resolving -ambiguities and holes in the specification.

-
-
-

The major changes in this version of the document include:

-
-
-
    -
  • -

    The previous version of this document was released under a Creative -Commons Attribution 4.0 International License by the original authors, -and this and future versions of this document will be released under the -same license.

    -
  • -
  • -

    Rearranged chapters to put all extensions first in canonical order.

    -
  • -
  • -

    Improvements to the description and commentary.

    -
  • -
  • -

    Modified implicit hinting suggestion on JALR to support more efficient -macro-op fusion of LUI/JALR and AUIPC/JALR pairs.

    -
  • -
  • -

    Clarification of constraints on load-reserved/store-conditional -sequences.

    -
  • -
  • -

    A new table of control and status register (CSR) mappings.

    -
  • -
  • -

    Clarified purpose and behavior of high-order bits of fcsr.

    -
  • -
  • -

    Corrected the description of the FNMADD.fmt and FNMSUB.fmt -instructions, which had suggested the incorrect sign of a zero result.

    -
  • -
  • -

    Instructions FMV.S.X and FMV.X.S were renamed to FMV.W.X and FMV.X.W -respectively to be more consistent with their semantics, which did not -change. The old names will continue to be supported in the tools.

    -
  • -
  • -

    Specified behavior of narrower ($<$FLEN) floating-point -values held in wider f registers using NaN-boxing model.

    -
  • -
  • -

    Defined the exception behavior of FMA($\infty$, 0, qNaN).

    -
  • -
  • -

    Added note indicating that the P extension might be reworked into an -integer packed-SIMD proposal for fixed-point operations using the -integer registers.

    -
  • -
  • -

    A draft proposal of the V vector instruction-set extension.

    -
  • -
  • -

    An early draft proposal of the N user-level traps extension.

    -
  • -
  • -

    An expanded pseudoinstruction listing.

    -
  • -
  • -

    Removal of the calling convention chapter, which has been superseded -by the RISC-V ELF psABI Specification (RISC-V ELF PsABI Specification, n.d.).

    -
  • -
  • -

    The C extension has been frozen and renumbered version 2.0.

    -
  • -
-
-
-

Preface to Document Version 2.1

-
-
-

This is version 2.1 of the document describing the RISC-V user-level -architecture. Note the frozen user-level ISA base and extensions IMAFDQ -version 2.0 have not changed from the previous version of this -document (Waterman et al., 2014), but some specification holes have been fixed and the -documentation has been improved. Some changes have been made to the -software conventions.

-
-
-
    -
  • -

    Numerous additions and improvements to the commentary sections.

    -
  • -
  • -

    Separate version numbers for each chapter.

    -
  • -
  • -

    Modification to long instruction encodings $>$64 bits to -avoid moving the rd specifier in very long instruction formats.

    -
  • -
  • -

    CSR instructions are now described in the base integer format where -the counter registers are introduced, as opposed to only being -introduced later in the floating-point section (and the companion -privileged architecture manual).

    -
  • -
  • -

    The SCALL and SBREAK instructions have been renamed to ECALL and -EBREAK, respectively. Their encoding and functionality are unchanged.

    -
  • -
  • -

    Clarification of floating-point NaN handling, and a new canonical NaN -value.

    -
  • -
  • -

    Clarification of values returned by floating-point to integer -conversions that overflow.

    -
  • -
  • -

    Clarification of LR/SC allowed successes and required failures, -including use of compressed instructions in the sequence.

    -
  • -
  • -

    A new RV32E base ISA proposal for reduced integer register counts, -supports MAC extensions.

    -
  • -
  • -

    A revised calling convention.

    -
  • -
  • -

    Relaxed stack alignment for soft-float calling convention, and -description of the RV32E calling convention.

    -
  • -
  • -

    A revised proposal for the C compressed extension, version 1.9.

    -
  • -
-
-
-

Preface to Version 2.0

-
-
-

This is the second release of the user ISA specification, and we intend -the specification of the base user ISA plus general extensions (i.e., -IMAFD) to remain fixed for future development. The following changes -have been made since Version 1.0 (Waterman et al., 2011) of this ISA specification.

-
-
-
    -
  • -

    The ISA has been divided into an integer base with several standard -extensions.

    -
  • -
  • -

    The instruction formats have been rearranged to make immediate -encoding more efficient.

    -
  • -
  • -

    The base ISA has been defined to have a little-endian memory system, -with big-endian or bi-endian as non-standard variants.

    -
  • -
  • -

    Load-Reserved/Store-Conditional (LR/SC) instructions have been added -in the atomic instruction extension.

    -
  • -
  • -

    AMOs and LR/SC can support the release consistency model.

    -
  • -
  • -

    The FENCE instruction provides finer-grain memory and I/O orderings.

    -
  • -
  • -

    An AMO for fetch-and-XOR (AMOXOR) has been added, and the encoding for -AMOSWAP has been changed to make room.

    -
  • -
  • -

    The AUIPC instruction, which adds a 20-bit upper immediate to the PC, -replaces the RDNPC instruction, which only read the current PC value. -This results in significant savings for position-independent code.

    -
  • -
  • -

    The JAL instruction has now moved to the U-Type format with an -explicit destination register, and the J instruction has been dropped -being replaced by JAL with rd=x0. This removes the only instruction -with an implicit destination register and removes the J-Type instruction -format from the base ISA. There is an accompanying reduction in JAL -reach, but a significant reduction in base ISA complexity.

    -
  • -
  • -

    The static hints on the JALR instruction have been dropped. The hints -are redundant with the rd and rs1 register specifiers for code -compliant with the standard calling convention.

    -
  • -
  • -

    The JALR instruction now clears the lowest bit of the calculated -target address, to simplify hardware and to allow auxiliary information -to be stored in function pointers.

    -
  • -
  • -

    The MFTX.S and MFTX.D instructions have been renamed to FMV.X.S and -FMV.X.D, respectively. Similarly, MXTF.S and MXTF.D instructions have -been renamed to FMV.S.X and FMV.D.X, respectively.

    -
  • -
  • -

    The MFFSR and MTFSR instructions have been renamed to FRCSR and FSCSR, -respectively. FRRM, FSRM, FRFLAGS, and FSFLAGS instructions have been -added to individually access the rounding mode and exception flags -subfields of the fcsr.

    -
  • -
  • -

    The FMV.X.S and FMV.X.D instructions now source their operands from -rs1, instead of rs2. This change simplifies datapath design.

    -
  • -
  • -

    FCLASS.S and FCLASS.D floating-point classify instructions have been -added.

    -
  • -
  • -

    A simpler NaN generation and propagation scheme has been adopted.

    -
  • -
  • -

    For RV32I, the system performance counters have been extended to -64-bits wide, with separate read access to the upper and lower 32 bits.

    -
  • -
  • -

    Canonical NOP and MV encodings have been defined.

    -
  • -
  • -

    Standard instruction-length encodings have been defined for 48-bit, -64-bit, and $>$64-bit instructions.

    -
  • -
  • -

    Description of a 128-bit address space variant, RV128, has been added.

    -
  • -
  • -

    Major opcodes in the 32-bit base instruction format have been -allocated for user-defined custom extensions.

    -
  • -
  • -

    A typographical error that suggested that stores source their data -from rd has been corrected to refer to rs2.

    -
  • -
-
-
-
-
-

1. Introduction

-
-
-

RISC-V (pronounced "risk-five") is a new instruction-set architecture -(ISA) that was originally designed to support computer architecture -research and education, but which we now hope will also become a -standard free and open architecture for industry implementations. Our -goals in defining RISC-V include:

-
-
-
    -
  • -

    A completely open ISA that is freely available to academia and -industry.

    -
  • -
  • -

    A real ISA suitable for direct native hardware implementation, not -just simulation or binary translation.

    -
  • -
  • -

    An ISA that avoids "over-architecting" for a particular -microarchitecture style (e.g., microcoded, in-order, decoupled, -out-of-order) or implementation technology (e.g., full-custom, ASIC, -FPGA), but which allows efficient implementation in any of these.

    -
  • -
  • -

    An ISA separated into a small base integer ISA, usable by itself as -a base for customized accelerators or for educational purposes, and -optional standard extensions, to support general-purpose software -development.

    -
  • -
  • -

    Support for the revised 2008 IEEE-754 floating-point standard. (ANSI/IEEE Std 754-2008, IEEE Standard for Floating-Point Arithmetic, 2008)

    -
  • -
  • -

    An ISA supporting extensive ISA extensions and specialized variants.

    -
  • -
  • -

    Both 32-bit and 64-bit address space variants for applications, -operating system kernels, and hardware implementations.

    -
  • -
  • -

    An ISA with support for highly parallel multicore or manycore -implementations, including heterogeneous multiprocessors.

    -
  • -
  • -

    Optional variable-length instructions to both expand available -instruction encoding space and to support an optional dense instruction -encoding for improved performance, static code size, and energy -efficiency.

    -
  • -
  • -

    A fully virtualizable ISA to ease hypervisor development.

    -
  • -
  • -

    An ISA that simplifies experiments with new privileged architecture -designs.

    -
  • -
-
-
- - - - - -
- - -
-

Commentary on our design decisions is formatted as in this paragraph. -This non-normative text can be skipped if the reader is only interested -in the specification itself.

-
-
-
-
- - - - - -
- - -
-

The name RISC-V was chosen to represent the fifth major RISC ISA design -from UC Berkeley (RISC-I (Patterson & Séquin, 1981), RISC-II (Katevenis et al., 1983), SOAR (Ungar et al., 1984), and SPUR (Lee et al., 1989) were the first -four). We also pun on the use of the Roman numeral "V" to signify -"variations" and "vectors", as support for a range of architecture -research, including various data-parallel accelerators, is an explicit -goal of the ISA design.

-
-
-
-
-

-The RISC-V ISA is defined avoiding implementation details as much as -possible (although commentary is included on implementation-driven -decisions) and should be read as the software-visible interface to a -wide variety of implementations rather than as the design of a -particular hardware artifact. The RISC-V manual is structured in two -volumes. This volume covers the design of the base unprivileged -instructions, including optional unprivileged ISA extensions. -Unprivileged instructions are those that are generally usable in all -privilege modes in all privileged architectures, though behavior might -vary depending on privilege mode and privilege architecture. The second -volume provides the design of the first ("classic") privileged -architecture. The manuals use IEC 80000-13:2008 conventions, with a byte -of 8 bits.

-
-
- - - - - -
- - -
-

In the unprivileged ISA design, we tried to remove any dependence on -particular microarchitectural features, such as cache line size, or on -privileged architecture details, such as page translation. This is both -for simplicity and to allow maximum flexibility for alternative -microarchitectures or alternative privileged architectures.

-
-
-
-
-

1.1. RISC-V Hardware Platform Terminology

-
-

A RISC-V hardware platform can contain one or more RISC-V-compatible -processing cores together with other non-RISC-V-compatible cores, -fixed-function accelerators, various physical memory structures, I/O -devices, and an interconnect structure to allow the components to -communicate. -

-
-
-

A component is termed a core if it contains an independent instruction -fetch unit. A RISC-V-compatible core might support multiple -RISC-V-compatible hardware threads, or harts, through multithreading. -

-
-
-

A RISC-V core might have additional specialized instruction-set -extensions or an added coprocessor. We use the term coprocessor to -refer to a unit that is attached to a RISC-V core and is mostly -sequenced by a RISC-V instruction stream, but which contains additional -architectural state and instruction-set extensions, and possibly some -limited autonomy relative to the primary RISC-V instruction stream.

-
-
-

We use the term accelerator to refer to either a non-programmable -fixed-function unit or a core that can operate autonomously but is -specialized for certain tasks. In RISC-V systems, we expect many -programmable accelerators will be RISC-V-based cores with specialized -instruction-set extensions and/or customized coprocessors. An important -class of RISC-V accelerators are I/O accelerators, which offload I/O -processing tasks from the main application cores. -

-
-
-

The system-level organization of a RISC-V hardware platform can range -from a single-core microcontroller to a many-thousand-node cluster of -shared-memory manycore server nodes. Even small systems-on-a-chip might -be structured as a hierarchy of multicomputers and/or multiprocessors to -modularize development effort or to provide secure isolation between -subsystems. -

-
-
-
-

1.2. RISC-V Software Execution Environments and Harts

-
-

The behavior of a RISC-V program depends on the execution environment in -which it runs. A RISC-V execution environment interface (EEI) defines -the initial state of the program, the number and type of harts in the -environment including the privilege modes supported by the harts, the -accessibility and attributes of memory and I/O regions, the behavior of -all legal instructions executed on each hart (i.e., the ISA is one -component of the EEI), and the handling of any interrupts or exceptions -raised during execution including environment calls. Examples of EEIs -include the Linux application binary interface (ABI), or the RISC-V -supervisor binary interface (SBI). The implementation of a RISC-V -execution environment can be pure hardware, pure software, or a -combination of hardware and software. For example, opcode traps and -software emulation can be used to implement functionality not provided -in hardware. Examples of execution environment implementations include:

-
-
-
    -
  • -

    "Bare metal" hardware platforms where harts are directly implemented -by physical processor threads and instructions have full access to the -physical address space. The hardware platform defines an execution -environment that begins at power-on reset.

    -
  • -
  • -

    RISC-V operating systems that provide multiple user-level execution -environments by multiplexing user-level harts onto available physical -processor threads and by controlling access to memory via virtual -memory.

    -
  • -
  • -

    RISC-V hypervisors that provide multiple supervisor-level execution -environments for guest operating systems.

    -
  • -
  • -

    RISC-V emulators, such as Spike, QEMU or rv8, which emulate RISC-V -harts on an underlying x86 system, and which can provide either a -user-level or a supervisor-level execution environment.

    -
  • -
-
-
- - - - - -
- - -
-

A bare hardware platform can be considered to define an EEI, where the -accessible harts, memory, and other devices populate the environment, -and the initial state is that at power-on reset. Generally, most -software is designed to use a more abstract interface to the hardware, -as more abstract EEIs provide greater portability across different -hardware platforms. Often EEIs are layered on top of one another, where -one higher-level EEI uses another lower-level EEI.

-
-
-
-
-

-From the perspective of software running in a given execution -environment, a hart is a resource that autonomously fetches and executes -RISC-V instructions within that execution environment. In this respect, -a hart behaves like a hardware thread resource even if time-multiplexed -onto real hardware by the execution environment. Some EEIs support the -creation and destruction of additional harts, for example, via -environment calls to fork new harts.

-
-
-

The execution environment is responsible for ensuring the eventual -forward progress of each of its harts. For a given hart, that -responsibility is suspended while the hart is exercising a mechanism -that explicitly waits for an event, such as the wait-for-interrupt -instruction defined in Volume II of this specification; and that -responsibility ends if the hart is terminated. The following events -constitute forward progress:

-
-
-
    -
  • -

    The retirement of an instruction.

    -
  • -
  • -

    A trap, as defined in Section 1.6.

    -
  • -
  • -

    Any other event defined by an extension to constitute forward -progress.

    -
  • -
-
-
- - - - - -
- - -
-

The term hart was introduced in the work on Lithe (Pan et al., 2009) and (Pan et al., 2010) to provide a term to -represent an abstract execution resource as opposed to a software thread -programming abstraction.

-
-
-

The important distinction between a hardware thread (hart) and a -software thread context is that the software running inside an execution -environment is not responsible for causing progress of each of its -harts; that is the responsibility of the outer execution environment. So -the environment’s harts operate like hardware threads from the -perspective of the software inside the execution environment.

-
-
-

An execution environment implementation might time-multiplex a set of -guest harts onto fewer host harts provided by its own execution -environment but must do so in a way that guest harts operate like -independent hardware threads. In particular, if there are more guest -harts than host harts then the execution environment must be able to -preempt the guest harts and must not wait indefinitely for guest -software on a guest hart to "yield" control of the guest hart.

-
-
-
-
-
-

1.3. RISC-V ISA Overview

-
-

A RISC-V ISA is defined as a base integer ISA, which must be present in -any implementation, plus optional extensions to the base ISA. The base -integer ISAs are very similar to that of the early RISC processors -except with no branch delay slots and with support for optional -variable-length instruction encodings. A base is carefully restricted to -a minimal set of instructions sufficient to provide a reasonable target -for compilers, assemblers, linkers, and operating systems (with -additional privileged operations), and so provides a convenient ISA and -software toolchain "skeleton" around which more customized processor -ISAs can be built.

-
-
-

Although it is convenient to speak of the RISC-V ISA, RISC-V is -actually a family of related ISAs, of which there are currently four -base ISAs. Each base integer instruction set is characterized by the -width of the integer registers and the corresponding size of the address -space and by the number of integer registers. There are two primary base -integer variants, RV32I and RV64I, described in -Chapter 2 and Chapter 4, which provide 32-bit -or 64-bit address spaces respectively. We use the term XLEN to refer to -the width of an integer register in bits (either 32 or 64). -Chapter 6 describes the RV32E and RV64E subset variants of the -RV32I or RV64I base instruction sets respectively, which have been added to support small -microcontrollers, and which have half the number of integer registers. -Chapter 8 sketches a future RV128I variant of the -base integer instruction set supporting a flat 128-bit address space -(XLEN=128). The base integer instruction sets use a two’s-complement -representation for signed integer values.

-
-
- - - - - -
- - -
-

Although 64-bit address spaces are a requirement for larger systems, we -believe 32-bit address spaces will remain adequate for many embedded and -client devices for decades to come and will be desirable to lower memory -traffic and energy consumption. In addition, 32-bit address spaces are -sufficient for educational purposes. A larger flat 128-bit address space -might eventually be required, so we ensured this could be accommodated -within the RISC-V ISA framework.

-
-
-
-
- - - - - -
- - -
-

The four base ISAs in RISC-V are treated as distinct base ISAs. A common -question is why is there not a single ISA, and in particular, why is -RV32I not a strict subset of RV64I? Some earlier ISA designs (SPARC, -MIPS) adopted a strict superset policy when increasing address space -size to support running existing 32-bit binaries on new 64-bit hardware.

-
-
-

The main advantage of explicitly separating base ISAs is that each base -ISA can be optimized for its needs without requiring to support all the -operations needed for other base ISAs. For example, RV64I can omit -instructions and CSRs that are only needed to cope with the narrower -registers in RV32I. The RV32I variants can use encoding space otherwise -reserved for instructions only required by wider address-space variants.

-
-
-

The main disadvantage of not treating the design as a single ISA is that -it complicates the hardware needed to emulate one base ISA on another -(e.g., RV32I on RV64I). However, differences in addressing and -illegal-instruction traps generally mean some mode switch would be required in -hardware in any case even with full superset instruction encodings, and -the different RISC-V base ISAs are similar enough that supporting -multiple versions is relatively low cost. Although some have proposed -that the strict superset design would allow legacy 32-bit libraries to -be linked with 64-bit code, this is impractical in practice, even with -compatible encodings, due to the differences in software calling -conventions and system-call interfaces.

-
-
-

The RISC-V privileged architecture provides fields in misa to control -the unprivileged ISA at each level to support emulating different base -ISAs on the same hardware. We note that newer SPARC and MIPS ISA -revisions have deprecated support for running 32-bit code unchanged on -64-bit systems.

-
-
-

A related question is why there is a different encoding for 32-bit adds -in RV32I (ADD) and RV64I (ADDW)? The ADDW opcode could be used for -32-bit adds in RV32I and ADDD for 64-bit adds in RV64I, instead of the -existing design which uses the same opcode ADD for 32-bit adds in RV32I -and 64-bit adds in RV64I with a different opcode ADDW for 32-bit adds in -RV64I. This would also be more consistent with the use of the same LW -opcode for 32-bit load in both RV32I and RV64I. The very first versions -of RISC-V ISA did have a variant of this alternate design, but the -RISC-V design was changed to the current choice in January 2011. Our -focus was on supporting 32-bit integers in the 64-bit ISA not on -providing compatibility with the 32-bit ISA, and the motivation was to -remove the asymmetry that arose from having not all opcodes in RV32I -have a *W suffix (e.g., ADDW, but AND not ANDW). In hindsight, this was -perhaps not well-justified and a consequence of designing both ISAs at -the same time as opposed to adding one later to sit on top of another, -and also from a belief we had to fold platform requirements into the ISA -spec which would imply that all the RV32I instructions would have been -required in RV64I. It is too late to change the encoding now, but this -is also of little practical consequence for the reasons stated above.

-
-
-

It has been noted we could enable the *W variants as an extension to -RV32I systems to provide a common encoding across RV64I and a future -RV32 variant.

-
-
-
-
-

RISC-V has been designed to support extensive customization and -specialization. Each base integer ISA can be extended with one or more -optional instruction-set extensions. An extension may be categorized as -either standard, custom, or non-conforming. For this purpose, we divide -each RISC-V instruction-set encoding space (and related encoding spaces -such as the CSRs) into three disjoint categories: standard, -reserved, and custom. Standard extensions and encodings are defined -by RISC-V International; any extensions not defined by RISC-V International are -non-standard. Each base ISA and its standard extensions use only -standard encodings, and shall not conflict with each other in their uses -of these encodings. Reserved encodings are currently not defined but are -saved for future standard extensions; once thus used, they become -standard encodings. Custom encodings shall never be used for standard -extensions and are made available for vendor-specific non-standard -extensions. Non-standard extensions are either custom extensions, that -use only custom encodings, or non-conforming extensions, that use any -standard or reserved encoding. Instruction-set extensions are generally -shared but may provide slightly different functionality depending on the -base ISA. Chapter 38 describes various ways -of extending the RISC-V ISA. We have also developed a naming convention -for RISC-V base instructions and instruction-set extensions, described -in detail in Chapter 39.

-
-
-

To support more general software development, a set of standard -extensions are defined to provide integer multiply/divide, atomic -operations, and single and double-precision floating-point arithmetic. -The base integer ISA is named "I" (prefixed by RV32 or RV64 depending -on integer register width), and contains integer computational -instructions, integer loads, integer stores, and control-flow -instructions. The standard integer multiplication and division extension -is named "M", and adds instructions to multiply and divide values held -in the integer registers. The standard atomic instruction extension, -denoted by "A", adds instructions that atomically read, modify, and -write memory for inter-processor synchronization. The standard -single-precision floating-point extension, denoted by "F", adds -floating-point registers, single-precision computational instructions, -and single-precision loads and stores. The standard double-precision -floating-point extension, denoted by "D", expands the floating-point -registers, and adds double-precision computational instructions, loads, -and stores. The standard "C" compressed instruction extension provides -narrower 16-bit forms of common instructions.

-
-
-

Beyond the base integer ISA and these standard extensions, we believe -it is rare that a new instruction will provide a significant benefit for -all applications, although it may be very beneficial for a certain -domain. As energy efficiency concerns are forcing greater -specialization, we believe it is important to simplify the required -portion of an ISA specification. Whereas other architectures usually -treat their ISA as a single entity, which changes to a new version as -instructions are added over time, RISC-V will endeavor to keep the base -and each standard extension constant over time, and instead layer new -instructions as further optional extensions. For example, the base -integer ISAs will continue as fully supported standalone ISAs, -regardless of any subsequent extensions.

-
-
-
-

1.4. Memory

-
-

A RISC-V hart has a single byte-addressable address space of -$2^{\text{XLEN}}$ bytes for all memory accesses. A word of -memory is defined as 32 bits (4 bytes). Correspondingly, a halfword is 16 bits (2 bytes), a -doubleword is 64 bits (8 bytes), and a quadword is 128 bits (16 bytes). The memory address space is -circular, so that the byte at address $2^{\text{XLEN}}-1$ is -adjacent to the byte at address zero. Accordingly, memory address -computations done by the hardware ignore overflow and instead wrap -around modulo $2^{\text{XLEN}}$.

-
-
-

The execution environment determines the mapping of hardware resources -into a hart’s address space. Different address ranges of a hart’s -address space may (1) contain main memory, or -(2) contain one or more I/O devices. Reads and writes of I/O devices -may have visible side effects, but accesses to main memory cannot. -Vacant address ranges are not a separate category but can be represented as -either main memory or I/O regions that are not accessible. -Although it is possible for the execution environment to call everything -in a hart’s address space an I/O device, it is usually expected that -some portion will be specified as main memory.

-
-
-

When a RISC-V platform has multiple harts, the address spaces of any two -harts may be entirely the same, or entirely different, or may be partly -different but sharing some subset of resources, mapped into the same or -different address ranges.

-
-
- - - - - -
- - -
-

For a purely "bare metal" environment, all harts may see an identical -address space, accessed entirely by physical addresses. However, when -the execution environment includes an operating system employing address -translation, it is common for each hart to be given a virtual address -space that is largely or entirely its own.

-
-
-
-
-

-
-
-

Executing each RISC-V machine instruction entails one or more memory -accesses, subdivided into implicit and explicit accesses. For each -instruction executed, an implicit memory read (instruction fetch) is -done to obtain the encoded instruction to execute. Many RISC-V -instructions perform no further memory accesses beyond instruction -fetch. Specific load and store instructions perform an explicit read -or write of memory at an address determined by the instruction. The -execution environment may dictate that instruction execution performs -other implicit memory accesses (such as to implement address -translation) beyond those documented for the unprivileged ISA.

-
-
-

The execution environment determines what portions of the -address space are accessible for each kind of memory access. For -example, the set of locations that can be implicitly read for -instruction fetch may or may not have any overlap with the set of -locations that can be explicitly read by a load instruction; and the set -of locations that can be explicitly written by a store instruction may -be only a subset of locations that can be read. Ordinarily, if an -instruction attempts to access memory at an inaccessible address, an -exception is raised for the instruction.

-
-
-

Except when specified otherwise, implicit reads that do not raise an -exception and that have no side effects may occur arbitrarily early and -speculatively, even before the machine could possibly prove that the -read will be needed. For instance, a valid implementation could attempt -to read all of main memory at the earliest opportunity, cache as many -fetchable (executable) bytes as possible for later instruction fetches, -and avoid reading main memory for instruction fetches ever again. To -ensure that certain implicit reads are ordered only after writes to the -same memory locations, software must execute specific fence or -cache-control instructions defined for this purpose (such as the FENCE.I -instruction defined in Chapter 6). -

-
-
-

The memory accesses (implicit or explicit) made by a hart may appear to -occur in a different order as perceived by another hart or by any other -agent that can access the same memory. This perceived reordering of -memory accesses is always constrained, however, by the applicable memory -consistency model. The default memory consistency model for RISC-V is -the RISC-V Weak Memory Ordering (RVWMO), defined in -Chapter 18 and in appendices. Optionally, -an implementation may adopt the stronger model of Total Store Ordering, -as defined in Chapter 19. The execution environment -may also add constraints that further limit the perceived reordering of -memory accesses. Since the RVWMO model is the weakest model allowed for -any RISC-V implementation, software written for this model is compatible -with the actual memory consistency rules of all RISC-V implementations. -As with implicit reads, software must execute fence or cache-control -instructions to ensure specific ordering of memory accesses beyond the -requirements of the assumed memory consistency model and execution -environment.

-
-
-
-

1.5. Base Instruction-Length Encoding

-
-

The base RISC-V ISA has fixed-length 32-bit instructions that must be -naturally aligned on 32-bit boundaries. However, the standard RISC-V -encoding scheme is designed to support ISA extensions with -variable-length instructions, where each instruction can be any number -of 16-bit instruction parcels in length and parcels are naturally -aligned on 16-bit boundaries. The standard compressed ISA extension -described in Chapter 28 reduces code size by -providing compressed 16-bit instructions and relaxes the alignment -constraints to allow all instructions (16 bit and 32 bit) to be aligned -on any 16-bit boundary to improve code density.

-
-
-

We use the term IALIGN (measured in bits) to refer to the -instruction-address alignment constraint the implementation enforces. -IALIGN is 32 bits in the base ISA, but some ISA extensions, including -the compressed ISA extension, relax IALIGN to 16 bits. IALIGN may not -take on any value other than 16 or 32. -

-
-
-

We use the term ILEN (measured in bits) to refer to the maximum -instruction length supported by an implementation, and which is always a -multiple of IALIGN. For implementations supporting only a base -instruction set, ILEN is 32 bits. Implementations supporting longer -instructions have larger values of ILEN.

-
-
-

Table 1 illustrates the standard -RISC-V instruction-length encoding convention. All the 32-bit -instructions in the base ISA have their lowest two bits set to 11. The -optional compressed 16-bit instruction-set extensions have their lowest -two bits equal to 00, 01, or 10.

-
-
-

1.5.1. Expanded Instruction-Length Encoding

-
-

A portion of the 32-bit instruction-encoding space has been tentatively -allocated for instructions longer than 32 bits. The entirety of this -space is reserved at this time, and the following proposal for encoding -instructions longer than 32 bits is not considered frozen. -

-
-
-

Standard instruction-set extensions encoded with more than 32 bits have -additional low-order bits set to 1, with the conventions for 48-bit -and 64-bit lengths shown in -Table 1. Instruction lengths -between 80 bits and 176 bits are encoded using a 3-bit field in bits -[14:12] giving the number of 16-bit words in addition to the first -5$\times$16-bit words. The encoding with bits [14:12] set to -"111" is reserved for future longer instruction encodings.

-
- - ------- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
Table 1. RISC-V instruction length encoding. Only the 16-bit and 32-bit encodings are considered frozen at this time.
xxxxxxxxxxxxxxaa 16-bit (aa≠11)

xxxxxxxxxxxxxxxx

xxxxxxxxxxxbbb11

32-bit (bbb≠111)

$\cdots$xxxx

xxxxxxxxxxxxxxxx

xxxxxxxxxx011111

48-bit

$\cdots$xxxx

xxxxxxxxxxxxxxxx

xxxxxxxxx0111111

64-bit

$\cdots$xxxx

xxxxxxxxxxxxxxxx

xnnnxxxxx1111111

(80+16*nnn)-bit, nnn≠111

$\cdots$xxxx

xxxxxxxxxxxxxxxx

x111xxxxx1111111

Reserved for ≥192-bits

Byte Address:

base+4

base+2

base

-
- - - - - -
- - -
-

Given the code size and energy savings of a compressed format, we wanted -to build in support for a compressed format to the ISA encoding scheme -rather than adding this as an afterthought, but to allow simpler -implementations we didn’t want to make the compressed format mandatory. -We also wanted to optionally allow longer instructions to support -experimentation and larger instruction-set extensions. Although our -encoding convention required a tighter encoding of the core RISC-V ISA, -this has several beneficial effects. -

-
-
-

An implementation of the standard IMAFD ISA need only hold the -most-significant 30 bits in instruction caches (a 6.25% saving). On -instruction cache refills, any instructions encountered with either low -bit clear should be recoded into illegal 30-bit instructions before -storing in the cache to preserve illegal-instruction exception behavior.

-
-
-

Perhaps more importantly, by condensing our base ISA into a subset of -the 32-bit instruction word, we leave more space available for -non-standard and custom extensions. In particular, the base RV32I ISA -uses less than 1/8 of the encoding space in the 32-bit instruction word. -As described in Chapter 38, an implementation that does not require support -for the standard compressed instruction extension can map 3 additional non-conforming -30-bit instruction spaces into the 32-bit fixed-width format, while preserving -support for standard ≥32-bit instruction-set -extensions. Further, if the implementation also does not need -instructions >32-bits in length, it can recover a further -four major opcodes for non-conforming extensions.

-
-
-
-
-

Encodings with bits [15:0] all zeros are defined as illegal -instructions. These instructions are considered to be of minimal length: -16 bits if any 16-bit instruction-set extension is present, otherwise 32 -bits. The encoding with bits [ILEN-1:0] all ones is also illegal; this -instruction is considered to be ILEN bits long.

-
-
- - - - - -
- - -
-

We consider it a feature that any length of instruction containing all -zero bits is not legal, as this quickly traps erroneous jumps into -zeroed memory regions. Similarly, we also reserve the instruction -encoding containing all ones to be an illegal instruction, to catch the -other common pattern observed with unprogrammed non-volatile memory -devices, disconnected memory buses, or broken memory devices.

-
-
-

Software can rely on a naturally aligned 32-bit word containing zero to -act as an illegal instruction on all RISC-V implementations, to be used -by software where an illegal instruction is explicitly desired. Defining -a corresponding known illegal value for all ones is more difficult due -to the variable-length encoding. Software cannot generally use the -illegal value of ILEN bits of all 1s, as software might not know ILEN -for the eventual target machine (e.g., if software is compiled into a -standard binary library used by many different machines). Defining a -32-bit word of all ones as illegal was also considered, as all machines -must support a 32-bit instruction size, but this requires the -instruction-fetch unit on machines with ILEN >32 to report an -illegal-instruction exception rather than an access-fault exception when -such an instruction borders a protection boundary, complicating -variable-instruction-length fetch and decode.

-
-
-
-
-

-RISC-V base ISAs have either little-endian or big-endian memory systems, -with the privileged architecture further defining bi-endian operation. -Instructions are stored in memory as a sequence of 16-bit little-endian -parcels, regardless of memory system endianness. Parcels forming one -instruction are stored at increasing halfword addresses, with the -lowest-addressed parcel holding the lowest-numbered bits in the -instruction specification. - -

-
-
- - - - - -
- - -
-

We originally chose little-endian byte ordering for the RISC-V memory -system because little-endian systems are currently dominant commercially -(all x86 systems; iOS, Android, and Windows for ARM). A minor point is -that we have also found little-endian memory systems to be more natural -for hardware designers. However, certain application areas, such as IP -networking, operate on big-endian data structures, and certain legacy -code bases have been built assuming big-endian processors, so we have -defined big-endian and bi-endian variants of RISC-V.

-
-
-

We have to fix the order in which instruction parcels are stored in -memory, independent of memory system endianness, to ensure that the -length-encoding bits always appear first in halfword address order. This -allows the length of a variable-length instruction to be quickly -determined by an instruction-fetch unit by examining only the first few -bits of the first 16-bit instruction parcel.

-
-
-

We further make the instruction parcels themselves little-endian to -decouple the instruction encoding from the memory system endianness -altogether. This design benefits both software tooling and bi-endian -hardware. Otherwise, for instance, a RISC-V assembler or disassembler -would always need to know the intended active endianness, despite that -in bi-endian systems, the endianness mode might change dynamically -during execution. In contrast, by giving instructions a fixed -endianness, it is sometimes possible for carefully written software to -be endianness-agnostic even in binary form, much like -position-independent code.

-
-
-

The choice to have instructions be only little-endian does have -consequences, however, for RISC-V software that encodes or decodes -machine instructions. Big-endian JIT compilers, for example, must swap -the byte order when storing to instruction memory.

-
-
-

Once we had decided to fix on a little-endian instruction encoding, this -naturally led to placing the length-encoding bits in the LSB positions -of the instruction format to avoid breaking up opcode fields.

-
-
-
-
-
-
-

1.6. Exceptions, Traps, and Interrupts

-
-

We use the term exception to refer to an unusual condition occurring -at run time associated with an instruction in the current RISC-V hart. -We use the term interrupt to refer to an external asynchronous event -that may cause a RISC-V hart to experience an unexpected transfer of -control. We use the term trap to refer to the transfer of control to a -trap handler caused by either an exception or an interrupt. - - -

-
-
-

The instruction descriptions in following chapters describe conditions -that can raise an exception during execution. The general behavior of -most RISC-V EEIs is that a trap to some handler occurs when an exception -is signaled on an instruction (except for floating-point exceptions, -which, in the standard floating-point extensions, do not cause traps). -The manner in which interrupts are generated, routed to, and enabled by -a hart depends on the EEI.

-
-
- - - - - -
- - -
-

Our use of "exception" and "trap" is compatible with that in the -IEEE-754 floating-point standard.

-
-
-
-
-

How traps are handled and made visible to software running on the hart -depends on the enclosing execution environment. From the perspective of -software running inside an execution environment, traps encountered by a -hart at runtime can have four different effects:

-
-
-
-
Contained Trap
-
-

The trap is visible to, and handled by, software running inside the -execution environment. For example, in an EEI providing both -supervisor and user mode on harts, an ECALL by a user-mode hart will -generally result in a transfer of control to a supervisor-mode handler -running on the same hart. Similarly, in the same environment, when a -hart is interrupted, an interrupt handler will be run in supervisor -mode on the hart.

-
-
Requested Trap
-
-

The trap is a synchronous exception that is an explicit call to the -execution environment requesting an action on behalf of software -inside the execution environment. An example is a system call. In this -case, execution may or may not resume on the hart after the requested -action is taken by the execution environment. For example, a system -call could remove the hart or cause an orderly termination of the -entire execution environment.

-
-
Invisible Trap
-
-

The trap is handled transparently by the execution environment and -execution resumes normally after the trap is handled. Examples include -emulating missing instructions, handling non-resident page faults in a -demand-paged virtual-memory system, or handling device interrupts for -a different job in a multiprogrammed machine. In these cases, the -software running inside the execution environment is not aware of the -trap (we ignore timing effects in these definitions).

-
-
Fatal Trap
-
-

The trap represents a fatal failure and causes the execution -environment to terminate execution. Examples include failing a -virtual-memory page-protection check or allowing a watchdog timer to -expire. Each EEI should define how execution is terminated and -reported to an external environment.

-
-
-
-
-

Table 2 shows the characteristics of each kind of trap.

-
- - ------- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
Table 2. Characteristics of traps
Contained Requested Invisible Fatal

Execution terminates

No

No¹

No

Yes

Software is oblivious

No

No

Yes

Yes²

Handled by environment

No

Yes

Yes

Yes

-
-

1 Termination may be requested
-2 Imprecise fatal traps might be observable by software

-
-
-

The EEI defines for each trap whether it is handled precisely, though -the recommendation is to maintain preciseness where possible. Contained -and requested traps can be observed to be imprecise by software inside -the execution environment. Invisible traps, by definition, cannot be -observed to be precise or imprecise by software running inside the -execution environment. Fatal traps can be observed to be imprecise by -software running inside the execution environment, if known-errorful -instructions do not cause immediate termination.

-
-
-

Because this document describes unprivileged instructions, traps are -rarely mentioned. Architectural means to handle contained traps are -defined in the privileged architecture manual, along with other features -to support richer EEIs. Unprivileged instructions that are defined -solely to cause requested traps are documented here. Invisible traps -are, by their nature, out of scope for this document. Instruction -encodings that are not defined here and not defined by some other means -may cause a fatal trap.

-
-
-
-

1.7. UNSPECIFIED Behaviors and Values

-
-

The architecture fully describes what implementations must do and any -constraints on what they may do. In cases where the architecture -intentionally does not constrain implementations, the term UNSPECIFIED is -explicitly used. - -

-
-
-

The term UNSPECIFIED refers to a behavior or value that is intentionally -unconstrained. The definition of these behaviors or values is open to -extensions, platform standards, or implementations. Extensions, platform -standards, or implementation documentation may provide normative content -to further constrain cases that the base architecture defines as UNSPECIFIED.

-
-
-

Like the base architecture, extensions should fully describe allowable -behavior and values and use the term UNSPECIFIED for cases that are intentionally -unconstrained. These cases may be constrained or defined by other -extensions, platform standards, or implementations.

-
-
-
-
-
-

2. RV32I Base Integer Instruction Set, Version 2.1

-
-
-

This chapter describes the RV32I base integer instruction set.

-
-
- - - - - -
- - -
-

RV32I was designed to be sufficient to form a compiler target and to -support modern operating system environments. The ISA was also designed -to reduce the hardware required in a minimal implementation. RV32I -contains 40 unique instructions, though a simple implementation might -cover the ECALL/EBREAK instructions with a single SYSTEM hardware -instruction that always traps and might be able to implement the FENCE -instruction as a NOP, reducing base instruction count to 38 total. RV32I -can emulate almost any other ISA extension (except the A extension, -which requires additional hardware support for atomicity).

-
-
-

In practice, a hardware implementation including the machine-mode -privileged architecture will also require the 6 CSR instructions.

-
-
-

Subsets of the base integer ISA might be useful for pedagogical -purposes, but the base has been defined such that there should be little -incentive to subset a real hardware implementation beyond omitting -support for misaligned memory accesses and treating all SYSTEM -instructions as a single trap.

-
-
-
-
- - - - - -
- - -
-

The standard RISC-V assembly language syntax is documented in the -Assembly Programmer’s Manual (RISC-V Assembly Programmer’s Manual, n.d.).

-
-
-
-
- - - - - -
- - -
-

Most of the commentary for RV32I also applies to the RV64I base.

-
-
-
-
-

2.1. Programmers' Model for Base Integer ISA

-
-

Table 3 shows the unprivileged state for the base -integer ISA. For RV32I, the 32 x registers are each 32 bits wide, -i.e., XLEN=32. Register x0 is hardwired with all bits equal to 0. -General purpose registers x1-x31 hold values that various -instructions interpret as a collection of Boolean values, or as two’s -complement signed binary integers or unsigned binary integers.

-
-
-

There is one additional unprivileged register: the program counter pc -holds the address of the current instruction.

-
- - ----- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
Table 3. RISC-V base unprivileged integer register state.
XLEN-1 0

x0/zero

x1

x2

x3

x4

x5

x6

x7

x8

x9

x10

x11

x12

x13

x14

x15

x16

x17

x18

x19

x20

x21

x22

x23

x24

x25

x26

x27

x28

x29

x30

x31

XLEN

XLEN-1

0

pc

XLEN

-
- - - - - -
- - -
-

There is no dedicated stack pointer or subroutine return address link -register in the Base Integer ISA; the instruction encoding allows any -x register to be used for these purposes. However, the standard -software calling convention uses register x1 to hold the return -address for a call, with register x5 available as an alternate link -register. The standard calling convention uses register x2 as the -stack pointer.

-
-
-

Hardware might choose to accelerate function calls and returns that use -x1 or x5. See the descriptions of the JAL and JALR instructions.

-
-
-

The optional compressed 16-bit instruction format is designed around the -assumption that x1 is the return address register and x2 is the -stack pointer. Software using other conventions will operate correctly -but may have greater code size.

-
-
-

The number of available architectural registers can have large impacts -on code size, performance, and energy consumption. Although 16 registers -would arguably be sufficient for an integer ISA running compiled code, -it is impossible to encode a complete ISA with 16 registers in 16-bit -instructions using a 3-address format. Although a 2-address format would -be possible, it would increase instruction count and lower efficiency. -We wanted to avoid intermediate instruction sizes (such as Xtensa’s -24-bit instructions) to simplify base hardware implementations, and once -a 32-bit instruction size was adopted, it was straightforward to support -32 integer registers. A larger number of integer registers also helps -performance on high-performance code, where there can be extensive use -of loop unrolling, software pipelining, and cache tiling.

-
-
-

For these reasons, we chose a conventional size of 32 integer registers -for RV32I. Dynamic register usage tends to be dominated by a few -frequently accessed registers, and regfile implementations can be -optimized to reduce access energy for the frequently accessed -registers (Tseng & Asanović, 2000). The optional compressed 16-bit instruction format mostly -only accesses 8 registers and hence can provide a dense instruction -encoding, while additional instruction-set extensions could support a -much larger register space (either flat or hierarchical) if desired.

-
-
-

For resource-constrained embedded applications, we have defined the -RV32E subset, which only has 16 registers -(Chapter 3).

-
-
-
-
-
-

2.2. Base Instruction Formats

-
-

In the base RV32I ISA, there are four core instruction formats -(R/I/S/U), as shown in Base instruction formats. All are a fixed 32 -bits in length. The base ISA has IALIGN=32, meaning that instructions must be aligned on a four-byte boundary in memory. An -instruction-address-misaligned exception is generated on a taken branch -or unconditional jump if the target address is not IALIGN-bit aligned. -This exception is reported on the branch or jump instruction, not on the -target instruction. No instruction-address-misaligned exception is -generated for a conditional branch that is not taken.

-
-
- - - - - -
- - -
-

The alignment constraint for base ISA instructions is relaxed to a -two-byte boundary when instruction extensions with 16-bit lengths or -other odd multiples of 16-bit lengths are added (i.e., IALIGN=16).

-
-
-

Instruction-address-misaligned exceptions are reported on the branch or -jump that would cause instruction misalignment to help debugging, and to -simplify hardware design for systems with IALIGN=32, where these are the -only places where misalignment can occur.

-
-
-
-
-

The behavior upon decoding a reserved instruction is UNSPECIFIED.

-
-
- - - - - -
- - -
-

Some platforms may require that opcodes reserved for standard use raise -an illegal-instruction exception. Other platforms may permit reserved -opcode space to be used for non-conforming extensions.

-
-
-
-
-

The RISC-V ISA keeps the source (rs1 and rs2) and destination (rd) -registers at the same position in all formats to simplify decoding. -Except for the 5-bit immediates used in CSR instructions -(Chapter 7), immediates are always -sign-extended, and are generally packed towards the leftmost available -bits in the instruction and have been allocated to reduce hardware -complexity. In particular, the sign bit for all immediates is always in -bit 31 of the instruction to speed sign-extension circuitry.

-
-
-
-Diagram -
-
-
-
-Diagram -
-
-
-
-Diagram -
-
-
-
-Diagram -
-
-
-

RISC-V base instruction formats. Each immediate subfield is labeled with the bit position (imm[x]) in the immediate value being produced, rather than the bit position within the instruction’s immediate field as is usually done.

-
-
- - - - - -
- - -
-

Decoding register specifiers is usually on the critical paths in -implementations, and so the instruction format was chosen to keep all -register specifiers at the same position in all formats at the expense -of having to move immediate bits across formats (a property shared with -RISC-IV aka. SPUR (Lee et al., 1989)).

-
-
-

In practice, most immediates are either small or require all XLEN bits. -We chose an asymmetric immediate split (12 bits in regular instructions -plus a special load-upper-immediate instruction with 20 bits) to -increase the opcode space available for regular instructions.

-
-
-

Immediates are sign-extended because we did not observe a benefit to -using zero extension for some immediates as in the MIPS ISA and wanted -to keep the ISA as simple as possible.

-
-
-
-
-
-

2.3. Immediate Encoding Variants

-
-

There are a further two variants of the instruction formats (B/J) based -on the handling of immediates, as shown in Base instruction formats immediate variants.

-
-
-
-Diagram -
-
-
-
-Diagram -
-
-
-
-Diagram -
-
-
-
-Diagram -
-
-
-
-Diagram -
-
-
-
-Diagram -
-
-
-

The only difference between the S and B formats is that the 12-bit -immediate field is used to encode branch offsets in multiples of 2 in -the B format. Instead of shifting all bits in the instruction-encoded -immediate left by one in hardware as is conventionally done, the middle -bits (imm[10:1]) and sign bit stay in fixed positions, while the lowest -bit in S format (inst[7]) encodes a high-order bit in B format.

-
-
-

Similarly, the only difference between the U and J formats is that the -20-bit immediate is shifted left by 12 bits to form U immediates and by -1 bit to form J immediates. The location of instruction bits in the U -and J format immediates is chosen to maximize overlap with the other -formats and with each other.

-
-
-

Immediate types shows the immediates produced by -each of the base instruction formats, and is labeled to show which -instruction bit (inst[y]) produces each bit of the immediate value.

-
-
-
-Diagram -
-
Figure 1. Types of immediate produced by RISC-V instructions.
-
-
-
-Diagram -
-
-
-
-Diagram -
-
-
-
-Diagram -
-
-
-
-Diagram -
-
-
-

The fields are labeled with the instruction bits used to construct their value. Sign extension always uses inst[31].

-
-
- - - - - -
- - -
-

Sign extension is one of the most critical operations on immediates -(particularly for XLEN>32), and in RISC-V the sign bit for -all immediates is always held in bit 31 of the instruction to allow -sign extension to proceed in parallel with instruction decoding.

-
-
-

Although more complex implementations might have separate adders for -branch and jump calculations and so would not benefit from keeping the -location of immediate bits constant across types of instruction, we -wanted to reduce the hardware cost of the simplest implementations. By -rotating bits in the instruction encoding of B and J immediates instead -of using dynamic hardware muxes to multiply the immediate by 2, we -reduce instruction signal fanout and immediate mux costs by around a -factor of 2. The scrambled immediate encoding will add negligible time -to static or ahead-of-time compilation. For dynamic generation of -instructions, there is some small additional overhead, but the most -common short forward branches have straightforward immediate encodings.

-
-
-
-
-
-

2.4. Integer Computational Instructions

-
-

Most integer computational instructions operate on XLEN bits of values -held in the integer register file. Integer computational instructions -are either encoded as register-immediate operations using the I-type -format or as register-register operations using the R-type format. The -destination is register rd for both register-immediate and -register-register instructions. No integer computational instructions -cause arithmetic exceptions.

-
-
- - - - - -
- - -
-

We did not include special instruction-set support for overflow checks -on integer arithmetic operations in the base instruction set, as many -overflow checks can be cheaply implemented using RISC-V branches. -Overflow checking for unsigned addition requires only a single -additional branch instruction after the addition: -add t0, t1, t2; bltu t0, t1, overflow.

-
-
-

For signed addition, if one operand’s sign is known, overflow checking -requires only a single branch after the addition: -addi t0, t1, +imm; blt t0, t1, overflow. This covers the common case -of addition with an immediate operand.

-
-
-

For general signed addition, three additional instructions after the -addition are required, leveraging the observation that the sum should be -less than one of the operands if and only if the other operand is -negative.

-
-
-
-
         add t0, t1, t2
-         slti t3, t2, 0
-         slt t4, t0, t1
-         bne t3, t4, overflow
-
-
-
-

In RV64I, checks of 32-bit signed additions can be optimized further by -comparing the results of ADD and ADDW on the operands.

-
-
-
-
-

2.4.1. Integer Register-Immediate Instructions

-
-
-Diagram -
-
-
-

ADDI adds the sign-extended 12-bit immediate to register rs1. -Arithmetic overflow is ignored and the result is simply the low XLEN -bits of the result. ADDI rd, rs1, 0 is used to implement the MV rd, -rs1 assembler pseudoinstruction.

-
-
-

SLTI (set less than immediate) places the value 1 in register rd if -register rs1 is less than the sign-extended immediate when both are -treated as signed numbers, else 0 is written to rd. SLTIU is similar -but compares the values as unsigned numbers (i.e., the immediate is -first sign-extended to XLEN bits then treated as an unsigned number). -Note, SLTIU rd, rs1, 1 sets rd to 1 if rs1 equals zero, otherwise -sets rd to 0 (assembler pseudoinstruction SEQZ rd, rs).

-
-
-

ANDI, ORI, XORI are logical operations that perform bitwise AND, OR, and -XOR on register rs1 and the sign-extended 12-bit immediate and place -the result in rd. Note, XORI rd, rs1, -1 performs a bitwise logical -inversion of register rs1 (assembler pseudoinstruction NOT rd, rs).

-
-
-
-Diagram -
-
-
-

Shifts by a constant are encoded as a specialization of the I-type -format. The operand to be shifted is in rs1, and the shift amount is -encoded in the lower 5 bits of the I-immediate field. The right shift -type is encoded in bit 30. SLLI is a logical left shift (zeros are -shifted into the lower bits); SRLI is a logical right shift (zeros are -shifted into the upper bits); and SRAI is an arithmetic right shift (the -original sign bit is copied into the vacated upper bits).

-
-
-
-Diagram -
-
-
-

LUI (load upper immediate) is used to build 32-bit constants and uses -the U-type format. LUI places the 32-bit U-immediate value into the -destination register rd, filling in the lowest 12 bits with zeros.

-
-
-

AUIPC (add upper immediate to pc) is used to build pc-relative -addresses and uses the U-type format. AUIPC forms a 32-bit offset from -the U-immediate, filling in the lowest 12 bits with zeros, adds this -offset to the address of the AUIPC instruction, then places the result -in register rd.

-
-
- - - - - -
- - -
-

The assembly syntax for lui and auipc does not represent the lower -12 bits of the U-immediate, which are always zero.

-
-
-

The AUIPC instruction supports two-instruction sequences to access -arbitrary offsets from the PC for both control-flow transfers and data -accesses. The combination of an AUIPC and the 12-bit immediate in a JALR -can transfer control to any 32-bit PC-relative address, while an AUIPC -plus the 12-bit immediate offset in regular load or store instructions -can access any 32-bit PC-relative data address.

-
-
-

The current PC can be obtained by setting the U-immediate to 0. Although -a JAL +4 instruction could also be used to obtain the local PC (of the -instruction following the JAL), it might cause pipeline breaks in -simpler microarchitectures or pollute BTB structures in more complex -microarchitectures.

-
-
-
-
-
-

2.4.2. Integer Register-Register Operations

-
-

RV32I defines several arithmetic R-type operations. All operations read -the rs1 and rs2 registers as source operands and write the result -into register rd. The funct7 and funct3 fields select the type of -operation.

-
-
-
-Diagram -
-
-
-

ADD performs the addition of rs1 and rs2. SUB performs the -subtraction of rs2 from rs1. Overflows are ignored and the low XLEN -bits of results are written to the destination rd. SLT and SLTU -perform signed and unsigned compares respectively, writing 1 to rd if -rs1 < rs2, 0 otherwise. Note, SLTU rd, x0, rs2 sets rd to 1 if -rs2 is not equal to zero, otherwise sets rd to zero (assembler -pseudoinstruction SNEZ rd, rs). AND, OR, and XOR perform bitwise -logical operations.

-
-
-

SLL, SRL, and SRA perform logical left, logical right, and arithmetic -right shifts on the value in register rs1 by the shift amount held in -the lower 5 bits of register rs2.

-
-
-
-

2.4.3. NOP Instruction

-
-
-Diagram -
-
-
-

The NOP instruction does not change any architecturally visible state, -except for advancing the pc and incrementing any applicable -performance counters. NOP is encoded as ADDI x0, x0, 0.

-
-
- - - - - -
- - -
-

NOPs can be used to align code segments to microarchitecturally -significant address boundaries, or to leave space for inline code -modifications. Although there are many possible ways to encode a NOP, we -define a canonical NOP encoding to allow microarchitectural -optimizations as well as for more readable disassembly output. The other -NOP encodings are made available for HINT Instructions.

-
-
-

ADDI was chosen for the NOP encoding as this is most likely to take -fewest resources to execute across a range of systems (if not optimized -away in decode). In particular, the instruction only reads one register. -Also, an ADDI functional unit is more likely to be available in a -superscalar design as adds are the most common operation. In particular, -address-generation functional units can execute ADDI using the same -hardware needed for base+offset address calculations, while -register-register ADD or logical/shift operations require additional -hardware.

-
-
-
-
-
-
-

2.5. Control Transfer Instructions

-
-

RV32I provides two types of control transfer instructions: unconditional -jumps and conditional branches. Control transfer instructions in RV32I -do not have architecturally visible delay slots.

-
-
-

If an instruction access-fault or instruction page-fault exception -occurs on the target of a jump or taken branch, the exception is -reported on the target instruction, not on the jump or branch -instruction.

-
-
-

2.5.1. Unconditional Jumps

-
-

The jump and link (JAL) instruction uses the J-type format, where the -J-immediate encodes a signed offset in multiples of 2 bytes. The offset -is sign-extended and added to the address of the jump instruction to -form the jump target address. Jumps can therefore target a -±1 MiB range. JAL stores the address of the instruction -following the jump (pc+4) into register rd. The standard software -calling convention uses x1 as the return address register and x5 as -an alternate link register.

-
-
- - - - - -
- - -
-

The alternate link register supports calling millicode routines (e.g., -those to save and restore registers in compressed code) while preserving -the regular return address register. The register x5 was chosen as the -alternate link register as it maps to a temporary in the standard -calling convention, and has an encoding that is only one bit different -than the regular link register.

-
-
-
-
-

Plain unconditional jumps (assembler pseudoinstruction J) are encoded as -a JAL with rd=x0.

-
-
-
-Diagram -
-
-
-

The indirect jump instruction JALR (jump and link register) uses the -I-type encoding. The target address is obtained by adding the -sign-extended 12-bit I-immediate to the register rs1, then setting the -least-significant bit of the result to zero. The address of the -instruction following the jump (pc+4) is written to register rd. -Register x0 can be used as the destination if the result is not -required.

-
-
-
-Diagram -
-
-
- - - - - -
- - -
-

The unconditional jump instructions all use PC-relative addressing to -help support position-independent code. The JALR instruction was defined -to enable a two-instruction sequence to jump anywhere in a 32-bit -absolute address range. A LUI instruction can first load rs1 with the -upper 20 bits of a target address, then JALR can add in the lower bits. -Similarly, AUIPC then JALR can jump anywhere in a 32-bit pc-relative -address range.

-
-
-

Note that the JALR instruction does not treat the 12-bit immediate as -multiples of 2 bytes, unlike the conditional branch instructions. This -avoids one more immediate format in hardware. In practice, most uses of -JALR will have either a zero immediate or be paired with a LUI or AUIPC, -so the slight reduction in range is not significant.

-
-
-

Clearing the least-significant bit when calculating the JALR target -address both simplifies the hardware slightly and allows the low bit of -function pointers to be used to store auxiliary information. Although -there is potentially a slight loss of error checking in this case, in -practice jumps to an incorrect instruction address will usually quickly -raise an exception.

-
-
-

When used with a base rs1=x0, JALR can be used to -implement a single instruction subroutine call to the lowest or highest -address region from anywhere in the address space, which could be used -to implement fast calls to a small runtime library. Alternatively, an -ABI could dedicate a general-purpose register to point to a library -elsewhere in the address space.

-
-
-
-
-

The JAL and JALR instructions will generate an -instruction-address-misaligned exception if the target address is not -aligned to a four-byte boundary.

-
-
- - - - - -
- - -
-

Instruction-address-misaligned exceptions are not possible on machines -that support extensions with 16-bit aligned instructions, such as the -compressed instruction-set extension, C.

-
-
-
-
-

Return-address prediction stacks are a common feature of -high-performance instruction-fetch units, but require accurate detection -of instructions used for procedure calls and returns to be effective. -For RISC-V, hints as to the instructions' usage are encoded implicitly -via the register numbers used. A JAL instruction should push the return -address onto a return-address stack (RAS) only when rd is x1 or -x5. JALR instructions should push/pop a RAS as shown in Table 4.

-
- - ------ - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
Table 4. Return-address stack prediction hints encoded in the register operands of a JALR instruction.
rd is x1/x5 | rs1 is x1/x5 | rd=rs1 | RAS action

No

No

 — 

None

No

Yes

 — 

Pop

Yes

No

 — 

Push

Yes

Yes

No

Pop, then push

Yes

Yes

Yes

Push

-
- - - - - -
- - -
-

Some other ISAs added explicit hint bits to their indirect-jump -instructions to guide return-address stack manipulation. We use implicit -hinting tied to register numbers and the calling convention to reduce -the encoding space used for these hints.

-
-
-

When two different link registers (x1 and x5) are given as rs1 and -rd, then the RAS is both popped and pushed to support coroutines. If -rs1 and rd are the same link register (either x1 or x5), the RAS -is only pushed to enable macro-op fusion of the sequences: -lui ra, imm20; jalr ra, imm12(ra) and auipc ra, imm20; jalr ra, imm12(ra).

-
-
-
-
-
-

2.5.2. Conditional Branches

-
-

All branch instructions use the B-type instruction format. The 12-bit -B-immediate encodes signed offsets in multiples of 2 bytes. The offset -is sign-extended and added to the address of the branch instruction to -give the target address. The conditional branch range is -±4 KiB.

-
-
-
-Diagram -
-
-
-

Branch instructions compare two registers. BEQ and BNE take the branch -if registers rs1 and rs2 are equal or unequal respectively. BLT and -BLTU take the branch if rs1 is less than rs2, using signed and -unsigned comparison respectively. BGE and BGEU take the branch if rs1 -is greater than or equal to rs2, using signed and unsigned comparison -respectively. Note, BGT, BGTU, BLE, and BLEU can be synthesized by -reversing the operands to BLT, BLTU, BGE, and BGEU, respectively.

-
-
- - - - - -
- - -
-

Signed array bounds may be checked with a single BLTU instruction, since -any negative index will compare greater than any nonnegative bound.

-
-
-
-
-

Software should be optimized such that the sequential code path is the -most common path, with less-frequently taken code paths placed out of -line. Software should also assume that backward branches will be -predicted taken and forward branches as not taken, at least the first -time they are encountered. Dynamic predictors should quickly learn any -predictable branch behavior.

-
-
-

Unlike some other architectures, the RISC-V jump (JAL with rd=x0) -instruction should always be used for unconditional branches instead of -a conditional branch instruction with an always-true condition. RISC-V -jumps are also PC-relative and support a much wider offset range than -branches, and will not pollute conditional-branch prediction tables.

-
-
- - - - - -
- - -
-

The conditional branches were designed to include arithmetic comparison -operations between two registers (as also done in PA-RISC, Xtensa, and -MIPS R6), rather than use condition codes (x86, ARM, SPARC, PowerPC), or -to only compare one register against zero (Alpha, MIPS), or two -registers only for equality (MIPS). This design was motivated by the -observation that a combined compare-and-branch instruction fits into a -regular pipeline, avoids additional condition code state or use of a -temporary register, and reduces static code size and dynamic instruction -fetch traffic. Another point is that comparisons against zero require -non-trivial circuit delay (especially after the move to static logic in -advanced processes) and so are almost as expensive as arithmetic -magnitude compares. Another advantage of a fused compare-and-branch -instruction is that branches are observed earlier in the front-end -instruction stream, and so can be predicted earlier. There is perhaps an -advantage to a design with condition codes in the case where multiple -branches can be taken based on the same condition codes, but we believe -this case to be relatively rare.

-
-
-

We considered but did not include static branch hints in the instruction -encoding. These can reduce the pressure on dynamic predictors, but -require more instruction encoding space and software profiling for best -results, and can result in poor performance if production runs do not -match profiling runs.

-
-
-

We considered but did not include conditional moves or predicated -instructions, which can effectively replace unpredictable short forward -branches. Conditional moves are the simpler of the two, but are -difficult to use with conditional code that might cause exceptions -(memory accesses and floating-point operations). Predication adds -additional flag state to a system, additional instructions to set and -clear flags, and additional encoding overhead on every instruction. Both -conditional move and predicated instructions add complexity to -out-of-order microarchitectures, adding an implicit third source operand -due to the need to copy the original value of the destination -architectural register into the renamed destination physical register if -the predicate is false. Also, static compile-time decisions to use -predication instead of branches can result in lower performance on -inputs not included in the compiler training set, especially given that -unpredictable branches are rare, and becoming rarer as branch prediction -techniques improve.

-
-
-

We note that various microarchitectural techniques exist to dynamically -convert unpredictable short forward branches into internally predicated -code to avoid the cost of flushing pipelines on a branch mispredict (Heil & Smith, 1996), (Klauser et al., 1998), (Kim et al., 2005) and -have been implemented in commercial processors (Sinharoy et al., 2011). The simplest techniques -just reduce the penalty of recovering from a mispredicted short forward -branch by only flushing instructions in the branch shadow instead of the -entire fetch pipeline, or by fetching instructions from both sides using -wide instruction fetch or idle instruction fetch slots. More complex -techniques for out-of-order cores add internal predicates on -instructions in the branch shadow, with the internal predicate value -written by the branch instruction, allowing the branch and following -instructions to be executed speculatively and out-of-order with respect -to other code.

-
-
-
-
-

The conditional branch instructions will generate an -instruction-address-misaligned exception if the target address is not -aligned to a four-byte boundary and the branch condition evaluates to -true. If the branch condition evaluates to false, the -instruction-address-misaligned exception will not be raised.

-
-
- - - - - -
- - -
-

Instruction-address-misaligned exceptions are not possible on machines -that support extensions with 16-bit aligned instructions, such as the -compressed instruction-set extension, C.

-
-
-
-
-
-
-

2.6. Load and Store Instructions

-
-

RV32I is a load-store architecture, where only load and store -instructions access memory and arithmetic instructions only operate on -CPU registers. RV32I provides a 32-bit address space that is -byte-addressed. The EEI will define what portions of the address space -are legal to access with which instructions (e.g., some addresses might -be read only, or support word access only). Loads with a destination of -x0 must still raise any exceptions and cause any other side effects -even though the load value is discarded.

-
-
-

The EEI will define whether the memory system is little-endian or -big-endian. In RISC-V, endianness is byte-address invariant.

-
-
- - - - - -
- - -
-

In a system for which endianness is byte-address invariant, the -following property holds: if a byte is stored to memory at some address -in some endianness, then a byte-sized load from that address in any -endianness returns the stored value.

-
-
-

In a little-endian configuration, multibyte stores write the -least-significant register byte at the lowest memory byte address, -followed by the other register bytes in ascending order of their -significance. Loads similarly transfer the contents of the lesser memory -byte addresses to the less-significant register bytes.

-
-
-

In a big-endian configuration, multibyte stores write the -most-significant register byte at the lowest memory byte address, -followed by the other register bytes in descending order of their -significance. Loads similarly transfer the contents of the greater -memory byte addresses to the less-significant register bytes.

-
-
-
-
-
-Diagram -
-
-
-
-Diagram -
-
-
-

Load and store instructions transfer a value between the registers and -memory. Loads are encoded in the I-type format and stores are S-type. -The effective address is obtained by adding register rs1 to the -sign-extended 12-bit offset. Loads copy a value from memory to register -rd. Stores copy the value in register rs2 to memory.

-
-
-

The LW instruction loads a 32-bit value from memory into rd. LH loads -a 16-bit value from memory, then sign-extends to 32-bits before storing -in rd. LHU loads a 16-bit value from memory but then zero extends to -32-bits before storing in rd. LB and LBU are defined analogously for -8-bit values. The SW, SH, and SB instructions store 32-bit, 16-bit, and -8-bit values from the low bits of register rs2 to memory.

-
-
-

Regardless of EEI, loads and stores whose effective addresses are -naturally aligned shall not raise an address-misaligned exception. Loads -and stores whose effective address is not naturally aligned to the -referenced datatype (i.e., the effective address is not divisible by the -size of the access in bytes) have behavior dependent on the EEI.

-
-
-

An EEI may guarantee that misaligned loads and stores are fully -supported, and so the software running inside the execution environment -will never experience a contained or fatal address-misaligned trap. In -this case, the misaligned loads and stores can be handled in hardware, -or via an invisible trap into the execution environment implementation, -or possibly a combination of hardware and invisible trap depending on -address.

-
-
-

An EEI may not guarantee misaligned loads and stores are handled -invisibly. In this case, loads and stores that are not naturally aligned -may either complete execution successfully or raise an exception. The -exception raised can be either an address-misaligned exception or an -access-fault exception. For a memory access that would otherwise be able -to complete except for the misalignment, an access-fault exception can -be raised instead of an address-misaligned exception if the misaligned -access should not be emulated, e.g., if accesses to the memory region -have side effects. When an EEI does not guarantee misaligned loads and -stores are handled invisibly, the EEI must define if exceptions caused -by address misalignment result in a contained trap (allowing software -running inside the execution environment to handle the trap) or a fatal -trap (terminating execution).

-
-
- - - - - -
- - -
-

Misaligned accesses are occasionally required when porting legacy code, -and help performance on applications when using any form of packed-SIMD -extension or handling externally packed data structures. Our rationale -for allowing EEIs to choose to support misaligned accesses via the -regular load and store instructions is to simplify the addition of -misaligned hardware support. One option would have been to disallow -misaligned accesses in the base ISAs and then provide some separate ISA -support for misaligned accesses, either special instructions to help -software handle misaligned accesses or a new hardware addressing mode -for misaligned accesses. Special instructions are difficult to use, -complicate the ISA, and often add new processor state (e.g., SPARC VIS -align address offset register) or complicate access to existing -processor state (e.g., MIPS LWL/LWR partial register writes). In -addition, for loop-oriented packed-SIMD code, the extra overhead when -operands are misaligned motivates software to provide multiple forms of -loop depending on operand alignment, which complicates code generation -and adds to loop startup overhead. New misaligned hardware addressing -modes take considerable space in the instruction encoding or require -very simplified addressing modes (e.g., register indirect only).

-
-
-
-
-

Even when misaligned loads and stores complete successfully, these -accesses might run extremely slowly depending on the implementation -(e.g., when implemented via an invisible trap). Furthermore, whereas -naturally aligned loads and stores are guaranteed to execute atomically, -misaligned loads and stores might not, and hence require additional -synchronization to ensure atomicity.

-
-
- - - - - -
- - -
-

We do not mandate atomicity for misaligned accesses so execution -environment implementations can use an invisible machine trap and a -software handler to handle some or all misaligned accesses. If hardware -misaligned support is provided, software can exploit this by simply -using regular load and store instructions. Hardware can then -automatically optimize accesses depending on whether runtime addresses -are aligned.

-
-
-
-
-
-

2.7. Memory Ordering Instructions

-
-
-mem order -
-
-
-

The FENCE instruction is used to order device I/O and memory accesses as -viewed by other RISC-V harts and external devices or coprocessors. Any -combination of device input (I), device output (O), memory reads (R), -and memory writes (W) may be ordered with respect to any combination of -the same. Informally, no other RISC-V hart or external device can -observe any operation in the successor set following a FENCE before -any operation in the predecessor set preceding the FENCE. -Chapter 18 provides a precise description -of the RISC-V memory consistency model.

-
-
-

The FENCE instruction also orders memory reads and writes made by the -hart as observed by memory reads and writes made by an external device. -However, FENCE does not order observations of events made by an external -device using any other signaling mechanism.

-
-
- - - - - -
- - -
-

A device might observe an access to a memory location via some external -communication mechanism, e.g., a memory-mapped control register that -drives an interrupt signal to an interrupt controller. This -communication is outside the scope of the FENCE ordering mechanism and -hence the FENCE instruction can provide no guarantee on when a change in -the interrupt signal is visible to the interrupt controller. Specific -devices might provide additional ordering guarantees to reduce software -overhead but those are outside the scope of the RISC-V memory model.

-
-
-
-
-

The EEI will define what I/O operations are possible, and in particular, -which memory addresses when accessed by load and store instructions will -be treated and ordered as device input and device output operations -respectively rather than memory reads and writes. For example, -memory-mapped I/O devices will typically be accessed with uncached loads -and stores that are ordered using the I and O bits rather than the R and -W bits. Instruction-set extensions might also describe new I/O -instructions that will also be ordered using the I and O bits in a -FENCE.

-
- - ----- - - - - - - - - - - - - - - - - - - - - - - - -
Table 5. Fence mode encoding
fm field | Mnemonic | Meaning

0000

none

Normal Fence

1000

TSO

With FENCE RW,RW: exclude write-to-read ordering; otherwise: Reserved for future use.

other

Reserved for future use.

-
-

The fence mode field fm defines the semantics of the FENCE. A FENCE -with fm=0000 orders all memory operations in its predecessor set -before all memory operations in its successor set.

-
-
-

The FENCE.TSO instruction is encoded as a FENCE instruction -with fm=1000, predecessor=RW, and successor=RW. FENCE.TSO orders -all load operations in its predecessor set before all memory operations -in its successor set, and all store operations in its predecessor set -before all store operations in its successor set. This leaves non-AMO -store operations in the FENCE.TSO’s predecessor set unordered with -non-AMO loads in its successor set.

-
-
- - - - - -
- - -
-

Because FENCE RW,RW imposes a superset of the orderings that FENCE.TSO -imposes, it is correct to ignore the fm field and implement FENCE.TSO as FENCE RW,RW.

-
-
-
-
-

The unused fields in the FENCE instructions—rs1 and rd—are reserved -for finer-grain fences in future extensions. For forward compatibility, -base implementations shall ignore these fields, and standard software -shall zero these fields. Likewise, many fm and predecessor/successor -set settings in Table 5 are also reserved for future use. -Base implementations shall treat all such reserved configurations as -normal fences with fm=0000, and standard software shall use only -non-reserved configurations.

-
-
- - - - - -
- - -
-

We chose a relaxed memory model to allow high performance from simple -machine implementations and from likely future coprocessor or -accelerator extensions. We separate out I/O ordering from memory R/W -ordering to avoid unnecessary serialization within a device-driver hart -and also to support alternative non-memory paths to control added -coprocessors or I/O devices. Simple implementations may additionally -ignore the predecessor and successor fields and always execute a -conservative fence on all operations.

-
-
-
-
-
-

2.8. Environment Call and Breakpoints

-
-

SYSTEM instructions are used to access system functionality that might -require privileged access and are encoded using the I-type instruction -format. These can be divided into two main classes: those that -atomically read-modify-write control and status registers (CSRs), and -all other potentially privileged instructions. CSR instructions are -described in Chapter 7, and the base -unprivileged instructions are described in the following section.

-
-
- - - - - -
- - -
-

The SYSTEM instructions are defined to allow simpler implementations to -always trap to a single software trap handler. More sophisticated -implementations might execute more of each system instruction in -hardware.

-
-
-
-
-
-Diagram -
-
-
-

These two instructions cause a precise requested trap to the supporting -execution environment.

-
-
-

The ECALL instruction is used to make a service request to the execution -environment. The EEI will define how parameters for the service request -are passed, but usually these will be in defined locations in the -integer register file.

-
-
-

The EBREAK instruction is used to return control to a debugging -environment.

-
-
- - - - - -
- - -
-

ECALL and EBREAK were previously named SCALL and SBREAK. The -instructions have the same functionality and encoding, but were renamed -to reflect that they can be used more generally than to call a -supervisor-level operating system or debugger.

-
-
-
-
- - - - - -
- - -
-

EBREAK was primarily designed to be used by a debugger to cause -execution to stop and fall back into the debugger. EBREAK is also used -by the standard gcc compiler to mark code paths that should not be -executed.

-
-
-

Another use of EBREAK is to support "semihosting", where the execution -environment includes a debugger that can provide services over an -alternate system call interface built around the EBREAK instruction. -Because the RISC-V base ISAs do not provide more than one EBREAK -instruction, RISC-V semihosting uses a special sequence of instructions -to distinguish a semihosting EBREAK from a debugger inserted EBREAK.

-
-
-
-
    slli x0, x0, 0x1f   # Entry NOP
-    ebreak              # Break to debugger
-    srai x0, x0, 7      # NOP encoding the semihosting call number 7
-
-
-
-

Note that these three instructions must be 32-bit-wide instructions, -i.e., they mustn’t be among the compressed 16-bit instructions described -in Chapter 28.

-
-
-

The shift NOP instructions are still considered available for use as -HINTs.

-
-
-

Semihosting is a form of service call and would be more naturally -encoded as an ECALL using an existing ABI, but this would require the -debugger to be able to intercept ECALLs, which is a newer addition to -the debug standard. We intend to move over to using ECALLs with a -standard ABI, in which case, semihosting can share a service ABI with an -existing standard.

-
-
-

We note that ARM processors have also moved to using SVC instead of BKPT -for semihosting calls in newer designs.

-
-
-
-
-
-

2.9. HINT Instructions

-
-

RV32I reserves a large encoding space for HINT instructions, which are -usually used to communicate performance hints to the microarchitecture. -Like the NOP instruction, HINTs do not change any architecturally -visible state, except for advancing the pc and any applicable -performance counters. Implementations are always allowed to ignore the -encoded hints.

-
-
-

Most RV32I HINTs are encoded as integer computational instructions with -rd=x0. The other RV32I HINTs are encoded as FENCE instructions with -a null predecessor or successor set and with fm=0.

-
-
- - - - - -
- - -
-

These HINT encodings have been chosen so that simple implementations can -ignore HINTs altogether, and instead execute a HINT as a regular -instruction that happens not to mutate the architectural state. For -example, ADD is a HINT if the destination register is x0; the five-bit -rs1 and rs2 fields encode arguments to the HINT. However, a simple -implementation can simply execute the HINT as an ADD of rs1 and rs2 -that writes x0, which has no architecturally visible effect.

-
-
-

As another example, a FENCE instruction with a zero pred field and a -zero fm field is a HINT; the succ, rs1, and rd fields encode the -arguments to the HINT. A simple implementation can simply execute the -HINT as a FENCE that orders the null set of prior memory accesses before -whichever subsequent memory accesses are encoded in the succ field. -Since the intersection of the predecessor and successor sets is null, -the instruction imposes no memory orderings, and so it has no -architecturally visible effect.

-
-
-
-
-

Table 6 lists all RV32I HINT code points. 91% of the -HINT space is reserved for standard HINTs. The remainder of the HINT -space is designated for custom HINTs: no standard HINTs will ever be -defined in this subspace.

-
-
- - - - - -
- - -
-

We anticipate standard hints to eventually include memory-system spatial -and temporal locality hints, branch prediction hints, thread-scheduling -hints, security tags, and instrumentation flags for simulation/emulation.

-
-
-
- - ------ - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
Table 6. RV32I HINT instructions.
Instruction | Constraints | Code Points | Purpose

LUI

rd=x0

$2^{20}$

Designated for future standard use

AUIPC

rd=x0

$2^{20}$

ADDI

rd=x0, and either rs1≠x0 or imm≠0

$2^{17}-1$

ANDI

rd=x0

stem cec3b648d9a79b7951e2288a4cb712cd

ORI

rd=x0

stem cec3b648d9a79b7951e2288a4cb712cd

XORI

rd=x0

stem cec3b648d9a79b7951e2288a4cb712cd

ADD

rd=x0, rs1x0

stem e68410fd305668d5d27741524c852b69

ADD

rd=x0, rs1=x0, rs2x2-x5

28

ADD

rd=x0, rs1=x0, rs2=x2-x5

4

(rs2=x2) NTL.P1
-(rs2=x3) NTL.PALL
-(rs2=x4) NTL.S1
-(rs2=x5) NTL.ALL

SUB

rd=x0

$2^{10}$

Designated for future standard use

AND

rd=x0

$2^{10}$

OR

rd=x0

$2^{10}$

XOR

rd=x0

$2^{10}$

SLL

rd=x0

$2^{10}$

SRL

rd=x0

$2^{10}$

SRA

rd=x0

$2^{10}$

FENCE

rd=x0, rs1≠x0, fm=0, and either pred=0 or succ=0

$2^{10}-63$

FENCE

rd≠x0, rs1=x0, fm=0, and either pred=0 or succ=0

$2^{10}-63$

FENCE

rd=rs1=x0, fm=0, pred=0, succ≠0

15

FENCE

rd=rs1=x0, fm=0, pred≠W, succ=0

15

FENCE

rd=rs1=x0, fm=0, pred=W, succ=0

1

PAUSE

SLTI

rd=x0

$2^{17}$

Designated for custom use

SLTIU

rd=x0

$2^{17}$

SLLI

rd=x0

$2^{10}$

SRLI

rd=x0

$2^{10}$

SRAI

rd=x0

$2^{10}$

SLT

rd=x0

$2^{10}$

SLTU

rd=x0

$2^{10}$

-
-
-
-
-

3. RV32E and RV64E Base Integer Instruction Sets, Version 2.0

-
-
-

CV32A65X: These instruction sets are not supported.

-
-
-
-
-

4. RV64I Base Integer Instruction Set, Version 2.1

-
-
-

CV32A65X: This instruction set is not supported.

-
-
-
-
-

5. RV128I Base Integer Instruction Set, Version 1.7

-
-
-

CV32A65X: This instruction set is not supported.

-
-
-
-
-

6. "Zifencei" Extension for Instruction-Fetch Fence, Version 2.0

-
-
-

CV32A65X: This extension is not supported.

-
-
-
-
-

7. "Zicsr", Extension for Control and Status Register (CSR) Instructions, Version 2.0

-
-
-

RISC-V defines a separate address space of 4096 Control and Status -registers associated with each hart. This chapter defines the full set -of CSR instructions that operate on these CSRs.

-
-
- - - - - -
- - -
-

While CSRs are primarily used by the privileged architecture, there are -several uses in unprivileged code including for counters and timers, and -for floating-point status.

-
-
-

The counters and timers are no longer considered mandatory parts of the -standard base ISAs, and so the CSR instructions required to access them -have been moved out of Chapter 2 into this separate -chapter.

-
-
-
-
-

7.1. CSR Instructions

-
-

All CSR instructions atomically read-modify-write a single CSR, whose -CSR specifier is encoded in the 12-bit csr field of the instruction -held in bits 31-20. The immediate forms use a 5-bit zero-extended -immediate encoded in the rs1 field.

-
-
-
-Diagram -
-
-
-

The CSRRW (Atomic Read/Write CSR) instruction atomically swaps values in -the CSRs and integer registers. CSRRW reads the old value of the CSR, -zero-extends the value to XLEN bits, then writes it to integer register -rd. The initial value in rs1 is written to the CSR. If rd=x0, -then the instruction shall not read the CSR and shall not cause any of -the side effects that might occur on a CSR read.

-
-
-

The CSRRS (Atomic Read and Set Bits in CSR) instruction reads the value -of the CSR, zero-extends the value to XLEN bits, and writes it to -integer register rd. The initial value in integer register rs1 is -treated as a bit mask that specifies bit positions to be set in the CSR. -Any bit that is high in rs1 will cause the corresponding bit to be set -in the CSR, if that CSR bit is writable.

-
-
-

The CSRRC (Atomic Read and Clear Bits in CSR) instruction reads the -value of the CSR, zero-extends the value to XLEN bits, and writes it to -integer register rd. The initial value in integer register rs1 is -treated as a bit mask that specifies bit positions to be cleared in the -CSR. Any bit that is high in rs1 will cause the corresponding bit to -be cleared in the CSR, if that CSR bit is writable.

-
-
-

For both CSRRS and CSRRC, if rs1=x0, then the instruction will not -write to the CSR at all, and so shall not cause any of the side effects -that might otherwise occur on a CSR write, nor raise illegal-instruction -exceptions on accesses to read-only CSRs. Both CSRRS and CSRRC always -read the addressed CSR and cause any read side effects regardless of -rs1 and rd fields. -Note that if rs1 specifies a register other than x0, and that register -holds a zero value, the instruction will not action any attendant per-field -side effects, but will action any side effects caused by writing to the entire -CSR.

-
-
-

A CSRRW with rs1=x0 will attempt to write zero to the destination CSR.

-
-
-

The CSRRWI, CSRRSI, and CSRRCI variants are similar to CSRRW, CSRRS, and -CSRRC respectively, except they update the CSR using an XLEN-bit value -obtained by zero-extending a 5-bit unsigned immediate (uimm[4:0]) field -encoded in the rs1 field instead of a value from an integer register. -For CSRRSI and CSRRCI, if the uimm[4:0] field is zero, then these -instructions will not write to the CSR, and shall not cause any of the -side effects that might otherwise occur on a CSR write, nor raise -illegal-instruction exceptions on accesses to read-only CSRs. For -CSRRWI, if rd=x0, then the instruction shall not read the CSR and -shall not cause any of the side effects that might occur on a CSR read. -Both CSRRSI and CSRRCI will always read the CSR and cause any read side -effects regardless of rd and rs1 fields.

-
- - ------- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
Table 7. Conditions determining whether a CSR instruction reads or writes the specified CSR.
Register operand

Instruction

rd is x0

rs1 is x0

Reads CSR

Writes CSR

CSRRW

Yes

-

No

Yes

CSRRW

No

-

Yes

Yes

CSRRS/CSRRC

-

Yes

Yes

No

CSRRS/CSRRC

-

No

Yes

Yes

Immediate operand

Instruction

rd is x0

uimm=0

Reads CSR

Writes -CSR

CSRRWI

Yes

-

No

Yes

CSRRWI

No

-

Yes

Yes

CSRRSI/CSRRCI

-

Yes

Yes

No

CSRRSI/CSRRCI

-

No

Yes

Yes

-
-

Table 7 summarizes the behavior of the CSR -instructions with respect to whether they read and/or write the CSR.

-
-
-

In addition to side effects that occur as a consequence of reading or -writing a CSR, individual fields within a CSR might have side effects -when written. The CSRRW[I] instructions action side effects for all -such fields within the written CSR. The CSRRS[I] and CSRRC[I] instructions -only action side effects for fields for which the rs1 or uimm argument -has at least one bit set corresponding to that field.

-
-
- - - - - -
- - -
-

As of this writing, no standard CSRs have side effects on field writes. -Hence, whether a standard CSR access has any side effects can be determined -solely from the opcode.

-
-
-

Defining CSRs with side effects on field writes is not recommended.

-
-
-
-
-

For any event or consequence that occurs due to a CSR having a -particular value, if a write to the CSR gives it that value, the -resulting event or consequence is said to be an indirect effect of the -write. Indirect effects of a CSR write are not considered by the RISC-V -ISA to be side effects of that write.

-
-
- - - - - -
- - -
-

An example of side effects for CSR accesses would be if reading from a -specific CSR causes a light bulb to turn on, while writing an odd value -to the same CSR causes the light to turn off. Assume writing an even -value has no effect. In this case, both the read and write have side -effects controlling whether the bulb is lit, as this condition is not -determined solely from the CSR value. (Note that after writing an odd -value to the CSR to turn off the light, then reading to turn the light -on, writing again the same odd value causes the light to turn off again. -Hence, on the last write, it is not a change in the CSR value that turns -off the light.)

-
-
-

On the other hand, if a bulb is rigged to light whenever the value of a -particular CSR is odd, then turning the light on and off is not -considered a side effect of writing to the CSR but merely an indirect -effect of such writes.

-
-
-

More concretely, the RISC-V privileged architecture defined in Volume II -specifies that certain combinations of CSR values cause a trap to occur. -When an explicit write to a CSR creates the conditions that trigger the -trap, the trap is not considered a side effect of the write but merely -an indirect effect.

-
-
-

Standard CSRs do not have any side effects on reads. Standard CSRs may -have side effects on writes. Custom extensions might add CSRs for which -accesses have side effects on either reads or writes.

-
-
-
-
-

Some CSRs, such as the instructions-retired counter, instret, may be -modified as side effects of instruction execution. In these cases, if a -CSR access instruction reads a CSR, it reads the value prior to the -execution of the instruction. If a CSR access instruction writes such a -CSR, the explicit write is done instead of the update from the side effect. -In particular, a value -written to instret by one instruction will be the value read by the -following instruction.

-
-
-

The assembler pseudoinstruction to read a CSR, CSRR rd, csr, is -encoded as CSRRS rd, csr, x0. The assembler pseudoinstruction to write -a CSR, CSRW csr, rs1, is encoded as CSRRW x0, csr, rs1, while CSRWI -csr, uimm, is encoded as CSRRWI x0, csr, uimm.

-
-
-

Further assembler pseudoinstructions are defined to set and clear bits -in the CSR when the old value is not required: CSRS/CSRC csr, rs1; -CSRSI/CSRCI csr, uimm.

-
-
-

7.1.1. CSR Access Ordering

-
-

Each RISC-V hart normally observes its own CSR accesses, including its -implicit CSR accesses, as performed in program order. In particular, -unless specified otherwise, a CSR access is performed after the -execution of any prior instructions in program order whose behavior -modifies or is modified by the CSR state and before the execution of any -subsequent instructions in program order whose behavior modifies or is -modified by the CSR state. Furthermore, an explicit CSR read returns the -CSR state before the execution of the instruction, while an explicit CSR -write suppresses and overrides any implicit writes or modifications to -the same CSR by the same instruction.

-
-
-

Likewise, any side effects from an explicit CSR access are normally -observed to occur synchronously in program order. Unless specified -otherwise, the full consequences of any such side effects are observable -by the very next instruction, and no consequences may be observed -out-of-order by preceding instructions. (Note the distinction made -earlier between side effects and indirect effects of CSR writes.)

-
-
-

For the RVWMO memory consistency model (Chapter 18), CSR accesses are weakly -ordered by default, so other harts or devices may observe CSR accesses -in an order different from program order. In addition, CSR accesses are -not ordered with respect to explicit memory accesses, unless a CSR -access modifies the execution behavior of the instruction that performs -the explicit memory access or unless a CSR access and an explicit memory -access are ordered by either the syntactic dependencies defined by the -memory model or the ordering requirements defined by the Memory-Ordering -PMAs section in Volume II of this manual. To enforce ordering in all -other cases, software should execute a FENCE instruction between the -relevant accesses. For the purposes of the FENCE instruction, CSR read -accesses are classified as device input (I), and CSR write accesses are -classified as device output (O).

-
-
- - - - - -
- - -
-

Informally, the CSR space acts as a weakly ordered memory-mapped I/O -region, as defined by the Memory-Ordering PMAs section in Volume II of -this manual. As a result, the order of CSR accesses with respect to all -other accesses is constrained by the same mechanisms that constrain the -order of memory-mapped I/O accesses to such a region.

-
-
-

These CSR-ordering constraints are imposed to support ordering main -memory and memory-mapped I/O accesses with respect to CSR accesses that -are visible to, or affected by, devices or other harts. Examples include -the time, cycle, and mcycle CSRs, in addition to CSRs that reflect -pending interrupts, like mip and sip. Note that implicit reads of -such CSRs (e.g., taking an interrupt because of a change in mip) are -also ordered as device input.

-
-
-

Most CSRs (including, e.g., the fcsr) are not visible to other harts; -their accesses can be freely reordered in the global memory order with -respect to FENCE instructions without violating this specification.

-
-
-
-
-

The hardware platform may define that accesses to certain CSRs are -strongly ordered, as defined by the Memory-Ordering PMAs section in -Volume II of this manual. Accesses to strongly ordered CSRs have -stronger ordering constraints with respect to accesses to both weakly -ordered CSRs and accesses to memory-mapped I/O regions.

-
-
- - - - - -
- - -
-

The rules for the reordering of CSR accesses in the global memory order -should probably be moved to Chapter 18 concerning the RVWMO memory consistency model.

-
-
-
-
-
-
-
-
-

8. "Zicntr" and "Zihpm" Extensions for Counters, Version 2.0

-
-
-

RISC-V ISAs provide a set of up to thirty-two 64-bit performance -counters and timers that are accessible via unprivileged XLEN-bit -read-only CSR registers 0xC00–0xC1F (when XLEN=32, the upper 32 bits -are accessed via CSR registers 0xC80–0xC9F). These counters are -divided between the "Zicntr" and "Zihpm" extensions.

-
-
-

8.1. "Zicntr" Extension for Base Counters and Timers

-
-

The Zicntr standard extension comprises the first three of these -counters (CYCLE, TIME, and INSTRET), which have dedicated functions -(cycle count, real-time clock, and instructions retired, respectively). -The Zicntr extension depends on the Zicsr extension.

-
-
- - - - - -
- - -
-

We recommend provision of these basic counters in implementations as -they are essential for basic performance analysis, adaptive and dynamic -optimization, and to allow an application to work with real-time -streams. Additional counters in the separate Zihpm extension can help -diagnose performance problems and these should be made accessible from -user-level application code with low overhead.

-
-
-

Some execution environments might prohibit access to counters, for -example, to impede timing side-channel attacks.

-
-
-
-
-
-Diagram -
-
-
-

For base ISAs with XLEN≥64, CSR instructions can access -the full 64-bit CSRs directly. In particular, the RDCYCLE, RDTIME, and -RDINSTRET pseudoinstructions read the full 64 bits of the cycle, -time, and instret counters.

-
-
- - - - - -
- - -
-

The counter pseudoinstructions are mapped to the read-only -csrrs rd, counter, x0 canonical form, but the other read-only CSR -instruction forms (based on CSRRC/CSRRSI/CSRRCI) are also legal ways to -read these CSRs.

-
-
-
-
-

For base ISAs with XLEN=32, the Zicntr extension enables the three -64-bit read-only counters to be accessed in 32-bit pieces. The RDCYCLE, -RDTIME, and RDINSTRET pseudoinstructions provide the lower 32 bits, and -the RDCYCLEH, RDTIMEH, and RDINSTRETH pseudoinstructions provide the -upper 32 bits of the respective counters.

-
-
- - - - - -
- - -
-

We required the counters be 64 bits wide, even when XLEN=32, as -otherwise it is very difficult for software to determine if values have -overflowed. For a low-end implementation, the upper 32 bits of each -counter can be implemented using software counters incremented by a trap -handler triggered by overflow of the lower 32 bits. The sample code -given below shows how the full 64-bit width value can be safely read -using the individual 32-bit width pseudoinstructions.

-
-
-
-
-

The RDCYCLE pseudoinstruction reads the low XLEN bits of the cycle -CSR which holds a count of the number of clock cycles executed by the -processor core on which the hart is running from an arbitrary start time -in the past. RDCYCLEH is only present when XLEN=32 and reads bits 63-32 -of the same cycle counter. The underlying 64-bit counter should never -overflow in practice. The rate at which the cycle counter advances will -depend on the implementation and operating environment. The execution -environment should provide a means to determine the current rate -(cycles/second) at which the cycle counter is incrementing.

-
-
- - - - - -
- - -
-

RDCYCLE is intended to return the number of cycles executed by the -processor core, not the hart. Precisely defining what is a "core" is -difficult given some implementation choices (e.g., AMD Bulldozer). -Precisely defining what is a "clock cycle" is also difficult given the -range of implementations (including software emulations), but the intent -is that RDCYCLE is used for performance monitoring along with the other -performance counters. In particular, where there is one hart/core, one -would expect cycle-count/instructions-retired to measure CPI for a hart.

-
-
-

Cores don’t have to be exposed to software at all, and an implementor -might choose to pretend multiple harts on one physical core are running -on separate cores with one hart/core, and provide separate cycle -counters for each hart. This might make sense in a simple barrel -processor (e.g., CDC 6600 peripheral processors) where inter-hart timing -interactions are non-existent or minimal.

-
-
-

Where there is more than one hart/core and dynamic multithreading, it is -not generally possible to separate out cycles per hart (especially with -SMT). It might be possible to define a separate performance counter that -tried to capture the number of cycles a particular hart was running, but -this definition would have to be very fuzzy to cover all the possible -threading implementations. For example, should we only count cycles for -which any instruction was issued to execution for this hart, and/or -cycles any instruction retired, or include cycles this hart was -occupying machine resources but couldn’t execute due to stalls while -other harts went into execution? Likely, "all of the above" would be -needed to have understandable performance stats. This complexity of -defining a per-hart cycle count, and also the need in any case for a -total per-core cycle count when tuning multithreaded code led to just -standardizing the per-core cycle counter, which also happens to work -well for the common single hart/core case.

-
-
-

Standardizing what happens during "sleep" is not practical given that -what "sleep" means is not standardized across execution environments, -but if the entire core is paused (entirely clock-gated or powered-down -in deep sleep), then it is not executing clock cycles, and the cycle -count shouldn’t be increasing per the spec. There are many details, -e.g., whether clock cycles required to reset a processor after waking up -from a power-down event should be counted, and these are considered -execution-environment-specific details.

-
-
-

Even though there is no precise definition that works for all platforms, -this is still a useful facility for most platforms, and an imprecise, -common, "usually correct" standard here is better than no standard. -The intent of RDCYCLE was primarily performance monitoring/tuning, and -the specification was written with that goal in mind.

-
-
-
-
-

The RDTIME pseudoinstruction reads the low XLEN bits of the "time" CSR, -which counts wall-clock real time that has passed from an arbitrary -start time in the past. RDTIMEH is only present when XLEN=32 and reads -bits 63-32 of the same real-time counter. The underlying 64-bit counter -increments by one with each tick of the real-time clock, and, for -realistic real-time clock frequencies, should never overflow in -practice. The execution environment should provide a means of -determining the period of a counter tick (seconds/tick). The period -should be constant within a small error bound. The environment should -provide a means to determine the accuracy of the clock (i.e., the -maximum relative error between the nominal and actual real-time clock -periods).

-
-
- - - - - -
- - -
-

On some simple platforms, cycle count might represent a valid -implementation of RDTIME, in which case RDTIME and RDCYCLE may return -the same result.

-
-
-

It is difficult to provide a strict mandate on clock period given the -wide variety of possible implementation platforms. The maximum error -bound should be set based on the requirements of the platform.

-
-
-
-
-

The real-time clocks of all harts must be synchronized to within one -tick of the real-time clock.

-
-
- - - - - -
- - -
-

As with other architectural mandates, it suffices to appear "as if" -harts are synchronized to within one tick of the real-time clock, i.e., -software is unable to observe that there is a greater delta between the -real-time clock values observed on two harts.

-
-
-
-
-

The RDINSTRET pseudoinstruction reads the low XLEN bits of the -instret CSR, which counts the number of instructions retired by this -hart from some arbitrary start point in the past. RDINSTRETH is only -present when XLEN=32 and reads bits 63-32 of the same instruction -counter. The underlying 64-bit counter should never overflow in -practice.

-
-
- - - - - -
- - -
-

Instructions that cause synchronous exceptions, including ECALL and -EBREAK, are not considered to retire and hence do not increment the -instret CSR.

-
-
-
-
-

The following code sequence will read a valid 64-bit cycle counter value -into x3:x2, even if the counter overflows its lower half between -reading its upper and lower halves.

-
-
-
Listing 1. Sample code for reading the 64-bit cycle counter when XLEN=32.
-
-
    again:
-        rdcycleh     x3
-        rdcycle      x2
-        rdcycleh     x4
-        bne          x3, x4, again
-
-
-
-
-

8.2. "Zihpm" Extension for Hardware Performance Counters

-
-

CV32A65X: This extension is not supported.

-
-
-
-
-
-

9. "Zihintntl" Extension for Non-Temporal Locality Hints, Version 1.0

-
-
-

CV32A65X: This extension is not supported.

-
-
-
-
-

10. "Zihintpause" Extension for Pause Hint, Version 2.0

-
-
-

CV32A65X: This extension is not supported.

-
-
-
-
-

11. "Zimop" Extension for May-Be-Operations, Version 1.0

-
-
-

CV32A65X: This extension is not supported.

-
-
-

11.1. "Zcmop" Compressed May-Be-Operations Extension, Version 1.0

-
-

CV32A65X: This extension is not supported.

-
-
-
-
-
-

12. "Zicond" Extension for Integer Conditional Operations, Version 1.0.0

-
-
-

CV32A65X: This extension is not supported.

-
-
-
-
-

13. "M" Extension for Integer Multiplication and Division, Version 2.0

-
-
-

This chapter describes the standard integer multiplication and division -instruction extension, which is named "M" and contains instructions -that multiply or divide values held in two integer registers.

-
-
- - - - - -
- - -
-

We separate integer multiply and divide out from the base to simplify -low-end implementations, or for applications where integer multiply and -divide operations are either infrequent or better handled in attached -accelerators.

-
-
-
-
-

13.1. Multiplication Operations

-
-
-Diagram -
-
-
-

- -

-
-
-

MUL performs an XLEN-bit×XLEN-bit multiplication of -rs1 by rs2 and places the lower XLEN bits in the destination -register. MULH, MULHU, and MULHSU perform the same multiplication but -return the upper XLEN bits of the full 2×XLEN-bit -product, for signed×signed, -unsigned×unsigned, and signed rs1×unsigned rs2 multiplication, respectively. -If both the high and low bits of the same product are required, then the recommended code sequence is: MULH[[S]U] rdh, rs1, rs2; MUL rdl, rs1, rs2 (source register specifiers must be in same order and rdh cannot be the same as rs1 or rs2). Microarchitectures can then fuse these into a single multiply operation instead of performing two separate multiplies.

-
-
- - - - - -
- - -
-

MULHSU is used in multi-word signed multiplication to multiply the -most-significant word of the multiplicand (which contains the sign bit) -with the less-significant words of the multiplier (which are unsigned).

-
-
-
-
-

MULW is an RV64 instruction that multiplies the lower 32 bits of the -source registers, placing the sign extension of the lower 32 bits of the -result into the destination register.

-
-
- - - - - -
- - -
-

In RV64, MUL can be used to obtain the upper 32 bits of the 64-bit -product, but signed arguments must be proper 32-bit signed values, -whereas unsigned arguments must have their upper 32 bits clear. If the -arguments are not known to be sign- or zero-extended, an alternative is -to shift both arguments left by 32 bits, then use MULH[[S]U].

-
-
-
-
-
-

13.2. Division Operations

-
-
-Diagram -
-
-
-

-

-
-
-

DIV and DIVU perform an XLEN bits by XLEN bits signed and unsigned -integer division of rs1 by rs2, rounding towards zero. REM and REMU -provide the remainder of the corresponding division operation. For REM, -the sign of a nonzero result equals the sign of the dividend.

-
-
- - - - - -
- - -
-

For both signed and unsigned division, except in the case of overflow, it holds -that -$\textrm{dividend} = \textrm{divisor} \times \textrm{quotient} + \textrm{remainder}$.

-
-
-
-
-

If both the quotient and remainder are required from the same division, -the recommended code sequence is: DIV[U] rdq, rs1, rs2; REM[U] rdr, -rs1, rs2 (rdq cannot be the same as rs1 or rs2). -Microarchitectures can then fuse these into a single divide operation -instead of performing two separate divides.

-
-
-

DIVW and DIVUW are RV64 instructions that divide the lower 32 bits of -rs1 by the lower 32 bits of rs2, treating them as signed and -unsigned integers respectively, placing the 32-bit quotient in rd, -sign-extended to 64 bits. REMW and REMUW are RV64 instructions that -provide the corresponding signed and unsigned remainder operations -respectively. Both REMW and REMUW always sign-extend the 32-bit result -to 64 bits, including on a divide by zero. -

-
-
-

The semantics for division by zero and division overflow are summarized -in Table 8. The quotient of division by zero has all bits -set, and the remainder of division by zero equals the dividend. Signed -division overflow occurs only when the most-negative integer is divided -by $-1$. The quotient of a signed division with overflow is -equal to the dividend, and the remainder is zero. Unsigned division -overflow cannot occur.

-
- - --------- - - - - - - - - - - - - - - - - - - - - - - -
Table 8. Semantics for division by zero and division overflow. L is the width of the operation in bits: XLEN for DIV[U] and REM[U], or 32 for DIV[U]W and REM[U]W.
ConditionDividendDivisorDIVU[W]REMU[W]DIV[W]REM[W]

Division by zero
-Overflow (signed only)

$x$
-$-2^{L-1}$

0
-$-1$

$2^{L}-1$
- -

$x$
- -

$-1$
- $-2^{L-1}$

$x$
- 0

-
- - - - - -
- - -
-

We considered raising exceptions on integer divide by zero, with these -exceptions causing a trap in most execution environments. However, this -would be the only arithmetic trap in the standard ISA (floating-point -exceptions set flags and write default values, but do not cause traps) -and would require language implementors to interact with the execution -environment’s trap handlers for this case. Further, where language -standards mandate that a divide-by-zero exception must cause an -immediate control flow change, only a single branch instruction needs to -be added to each divide operation, and this branch instruction can be -inserted after the divide and should normally be very predictably not -taken, adding little runtime overhead.

-
-
-

The value of all bits set is returned for both unsigned and signed -divide by zero to simplify the divider circuitry. The value of all 1s is -both the natural value to return for unsigned divide, representing the -largest unsigned number, and also the natural result for simple unsigned -divider implementations. Signed division is often implemented using an -unsigned division circuit and specifying the same overflow result -simplifies the hardware.

-
-
-
-
-
-

13.3. Zmmul Extension, Version 1.0

-
-

The Zmmul extension implements the multiplication subset of the M -extension. It adds all of the instructions defined in -Section 13.1, namely: MUL, MULH, MULHU, -MULHSU, and (for RV64 only) MULW. The encodings are identical to those -of the corresponding M-extension instructions. M implies Zmmul. -

-
-
- - - - - -
- - -
-

The Zmmul extension enables low-cost implementations that require -multiplication operations but not division. For many microcontroller -applications, division operations are too infrequent to justify the cost -of divider hardware. By contrast, multiplication operations are more -frequent, making the cost of multiplier hardware more justifiable. -Simple FPGA soft cores particularly benefit from eliminating division -but retaining multiplication, since many FPGAs provide hardwired -multipliers but require dividers be implemented in soft logic.

-
-
-
-
-
-
-
-

14. "A" Extension for Atomic Instructions, Version 2.1

-
-
-

CV32A65X: This extension is not supported.

-
-
-
-
-

15. "Zawrs" Extension for Wait-on-Reservation-Set instructions, Version 1.01

-
-
-

CV32A65X: This extension is not supported.

-
-
-
-
-

16. "Zacas" Extension for Atomic Compare-and-Swap (CAS) Instructions, Version 1.0.0

-
-
-

CV32A65X: This extension is not supported.

-
-
-
-
-

17. "Zabha" Extension for Byte and Halfword Atomic Memory Operations, Version 1.0.0

-
-
-

CV32A65X: This extension is not supported.

-
-
-
-
-

18. RVWMO Memory Consistency Model, Version 2.0

-
-
-

This chapter defines the RISC-V memory consistency model. A memory -consistency model is a set of rules specifying the values that can be -returned by loads of memory. RISC-V uses a memory model called "RVWMO" -(RISC-V Weak Memory Ordering) which is designed to provide flexibility -for architects to build high-performance scalable designs while -simultaneously supporting a tractable programming model. - -

-
-
-

Under RVWMO, code running on a single hart appears to execute in order -from the perspective of other memory instructions in the same hart, but -memory instructions from another hart may observe the memory -instructions from the first hart being executed in a different order. -Therefore, multithreaded code may require explicit synchronization to -guarantee ordering between memory instructions from different harts. The -base RISC-V ISA provides a FENCE instruction for this purpose, described -in Section 2.7, while the atomics extension "A" additionally defines load-reserved/store-conditional and atomic read-modify-write instructions. -

-
-
-

The standard ISA extension for total store ordering "Ztso" (Chapter 19) augments -RVWMO with additional rules specific to that extension.

-
-
-

The appendices to this specification provide both axiomatic and -operational formalizations of the memory consistency model as well as -additional explanatory material. - -

-
-
- - - - - -
- - -
-

This chapter defines the memory model for regular main memory -operations. The interaction of the memory model with I/O memory, -instruction fetches, FENCE.I, page table walks, and SFENCE.VMA is not -(yet) formalized. Some or all of the above may be formalized in a future -revision of this specification. The RV128 base ISA and future ISA -extensions such as the V vector and J JIT extensions will need -to be incorporated into a future revision as well.

-
-
-

Memory consistency models supporting overlapping memory accesses of -different widths simultaneously remain an active area of academic -research and are not yet fully understood. The specifics of how memory -accesses of different sizes interact under RVWMO are specified to the -best of our current abilities, but they are subject to revision should -new issues be uncovered.

-
-
-
-
-

18.1. Definition of the RVWMO Memory Model

-
-

The RVWMO memory model is defined in terms of the global memory order, -a total ordering of the memory operations produced by all harts. In -general, a multithreaded program has many different possible executions, -with each execution having its own corresponding global memory order. -

-
-
-

The global memory order is defined over the primitive load and store -operations generated by memory instructions. It is then subject to the -constraints defined in the rest of this chapter. Any execution -satisfying all of the memory model constraints is a legal execution (as -far as the memory model is concerned).

-
-
-

18.1.1. Memory Model Primitives

-
-

The program order over memory operations reflects the order in which -the instructions that generate each load and store are logically laid -out in that hart’s dynamic instruction stream; i.e., the order in which -a simple in-order processor would execute the instructions of that hart.

-
-
-

Memory-accessing instructions give rise to memory operations. A memory -operation can be either a load operation, a store operation, or both -simultaneously. All memory operations are single-copy atomic: they can -never be observed in a partially complete state. -

-
-
-

Among instructions in RV32GC and RV64GC, each aligned memory instruction -gives rise to exactly one memory operation, with two exceptions. First, -an unsuccessful SC instruction does not give rise to any memory -operations. Second, FLD and FSD instructions may each give rise to -multiple memory operations if XLEN<64, as stated in -the "D" extension chapter (Chapter 22) and clarified below. An aligned AMO -gives rise to a single memory operation that is both a load operation -and a store operation simultaneously.

-
-
- - - - - -
- - -
-

Instructions in the RV128 base instruction set and in future ISA -extensions such as V (vector) and P (SIMD) may give rise to multiple -memory operations. However, the memory model for these extensions has -not yet been formalized.

-
-
-
-
-

A misaligned load or store instruction may be decomposed into a set of -component memory operations of any granularity. An FLD or FSD -instruction for which XLEN<64 may also be decomposed into -a set of component memory operations of any granularity. The memory -operations generated by such instructions are not ordered with respect -to each other in program order, but they are ordered normally with -respect to the memory operations generated by preceding and subsequent -instructions in program order. -The atomics extension "A" does not require execution environments to support -misaligned atomic instructions at all. -However, if misaligned atomics are supported via the misaligned atomicity -granule PMA, then AMOs within an atomicity granule are not decomposed, nor are -loads and stores defined in the base ISAs, nor are loads and stores of no more -than XLEN bits defined in the F, D, and Q extensions. -

-
-
- - - - - -
- - -
-

The decomposition of misaligned memory operations down to byte -granularity facilitates emulation on implementations that do not -natively support misaligned accesses. Such implementations might, for -example, simply iterate over the bytes of a misaligned access one by -one.

-
-
-
-
-

An LR instruction and an SC instruction are said to be paired if the -LR precedes the SC in program order and if there are no other LR or SC -instructions in between; the corresponding memory operations are said to -be paired as well (except in case of a failed SC, where no store -operation is generated). The complete list of conditions determining -whether an SC must succeed, may succeed, or must fail is defined in -the load-reserved/store-conditional section of the "A" extension chapter.

-
-
-

Load and store operations may also carry one or more ordering -annotations from the following set: "acquire-RCpc", "acquire-RCsc", -"release-RCpc", and "release-RCsc". An AMO or LR instruction with -aq set has an "acquire-RCsc" annotation. An AMO or SC instruction -with rl set has a "release-RCsc" annotation. An AMO, LR, or SC -instruction with both aq and rl set has both "acquire-RCsc" and -"release-RCsc" annotations.

-
-
-

For convenience, we use the term "acquire annotation" to refer to an -acquire-RCpc annotation or an acquire-RCsc annotation. Likewise, a -"release annotation" refers to a release-RCpc annotation or a -release-RCsc annotation. An "RCpc annotation" refers to an -acquire-RCpc annotation or a release-RCpc annotation. An "RCsc -annotation" refers to an acquire-RCsc annotation or a release-RCsc -annotation.

-
-
- - - - - -
- - -
-

In the memory model literature, the term "RCpc" stands for release -consistency with processor-consistent synchronization operations, and -the term "RCsc" stands for release consistency with sequentially -consistent synchronization operations.

-
-
-

While there are many different definitions for acquire and release -annotations in the literature, in the context of RVWMO these terms are -concisely and completely defined by Preserved Program Order rules 5-7.

-
-
-

"RCpc" annotations are currently only used when implicitly assigned to -every memory access per the standard extension "Ztso" -(Chapter 19). Furthermore, although the ISA does not -currently contain native load-acquire or store-release instructions, nor -RCpc variants thereof, the RVWMO model itself is designed to be -forwards-compatible with the potential addition of any or all of the -above into the ISA in a future extension.

-
-
-
-
-
-

18.1.2. Syntactic Dependencies

-
-

The definition of the RVWMO memory model depends in part on the notion -of a syntactic dependency, defined as follows.

-
-
-

In the context of defining dependencies, a register refers either to -an entire general-purpose register, some portion of a CSR, or an entire -CSR. The granularity at which dependencies are tracked through CSRs is -specific to each CSR and is defined in -Section 18.2.

-
-
-

Syntactic dependencies are defined in terms of instructions' source -registers, instructions' destination registers, and the way -instructions carry a dependency from their source registers to their -destination registers. This section provides a general definition of all -of these terms; however, Section 18.3 provides a -complete listing of the specifics for each instruction.

-
-
-

In general, a register r other than x0 is a source -register for an instruction i if any of the following -hold:

-
-
-
    -
  • -

    In the opcode of i, rs1, rs2, or rs3 is set to -r

    -
  • -
  • -

    i is a CSR instruction, and in the opcode of -i, csr is set to r, unless i -is CSRRW or CSRRWI and rd is set to x0

    -
  • -
  • -

    r is a CSR and an implicit source register for -i, as defined in Section 18.3

    -
  • -
  • -

    r is a CSR that aliases with another source register for -i

    -
  • -
-
-
-

Memory instructions also further specify which source registers are -address source registers and which are data source registers.

-
-
-

In general, a register r other than x0 is a destination -register for an instruction i if any of the following -hold:

-
-
-
    -
  • -

    In the opcode of i, rd is set to r

    -
  • -
  • -

    i is a CSR instruction, and in the opcode of -i, csr is set to r, unless i -is CSRRS or CSRRC and rs1 is set to x0 or i is CSRRSI -or CSRRCI and uimm[4:0] is set to zero.

    -
  • -
  • -

    r is a CSR and an implicit destination register for -i, as defined in Section 18.3

    -
  • -
  • -

    r is a CSR that aliases with another destination -register for i

    -
  • -
-
-
-

Most non-memory instructions carry a dependency from each of their -source registers to each of their destination registers. However, there -are exceptions to this rule; see Section 18.3.

-
-
-

Instruction j has a syntactic dependency on instruction -i via destination register s of -i and source register r of j -if either of the following hold:

-
-
-
    -
  • -

    s is the same as r, and no instruction -program-ordered between i and j has -r as a destination register

    -
  • -
  • -

    There is an instruction m program-ordered between -i and j such that all of the following hold:

    -
    -
      -
    1. -

      j has a syntactic dependency on m via -destination register q and source register r

      -
    2. -
    3. -

      m has a syntactic dependency on i via -destination register s and source register p

      -
    4. -
    5. -

      m carries a dependency from p to -q

      -
    6. -
    -
    -
  • -
-
-
-

Finally, in the definitions that follow, let a and -b be two memory operations, and let i and -j be the instructions that generate a and -b, respectively.

-
-
-

b has a syntactic address dependency on a -if r is an address source register for j and -j has a syntactic dependency on i via source -register r

-
-
-

b has a syntactic data dependency on a if -b is a store operation, r is a data source -register for j, and j has a syntactic -dependency on i via source register r

-
-
-

b has a syntactic control dependency on a -if there is an instruction m program-ordered between -i and j such that m is a -branch or indirect jump and m has a syntactic dependency -on i.

-
-
- - - - - -
- - -
-

Generally speaking, non-AMO load instructions do not have data source -registers, and unconditional non-AMO store instructions do not have -destination registers. However, a successful SC instruction is -considered to have the register specified in rd as a destination -register, and hence it is possible for an instruction to have a -syntactic dependency on a successful SC instruction that precedes it in -program order.

-
-
-
-
-
-

18.1.3. Preserved Program Order

-
-

The global memory order for any given execution of a program respects -some but not all of each hart’s program order. The subset of program -order that must be respected by the global memory order is known as -preserved program order.

-
-
-

The complete definition of preserved program order is as follows (and -note that AMOs are simultaneously both loads and stores): memory -operation a precedes memory operation b in -preserved program order (and hence also in the global memory order) if -a precedes b in program order, -a and b both access regular main memory -(rather than I/O regions), and any of the following hold:

-
-
-
    -
  • -

    Overlapping-Address Orderings:

    -
    -
      -
    1. -

      b is a store, and -a and b access overlapping memory addresses

      -
    2. -
    3. -

      a and b are loads, -x is a byte read by both a and -b, there is no store to x between -a and b in program order, and -a and b return values for x -written by different memory operations

      -
    4. -
    5. -

      a is -generated by an AMO or SC instruction, b is a load, and -b returns a value written by a

      -
    6. -
    -
    -
  • -
  • -

    Explicit Synchronization

    -
    -
      -
    1. -

      There is a FENCE instruction that -orders a before b

      -
    2. -
    3. -

      a has an acquire -annotation

      -
    4. -
    5. -

      b has a release annotation

      -
    6. -
    7. -

      a and b both have -RCsc annotations

      -
    8. -
    9. -

      a is paired with -b

      -
    10. -
    -
    -
  • -
  • -

    Syntactic Dependencies

    -
    -
      -
    1. -

      b has a syntactic address -dependency on a

      -
    2. -
    3. -

      b has a syntactic data -dependency on a

      -
    4. -
    5. -

      b is a store, and -b has a syntactic control dependency on a

      -
    6. -
    -
    -
  • -
  • -

    Pipeline Dependencies

    -
    -
      -
    1. -

      b is a -load, and there exists some store m between -a and b in program order such that -m has an address or data dependency on a, -and b returns a value written by m

      -
    2. -
    3. -

      b is a store, and -there exists some instruction m between a -and b in program order such that m has an -address dependency on a

      -
    4. -
    -
    -
  • -
-
-
-
-

18.1.4. Memory Model Axioms

-
-

An execution of a RISC-V program obeys the RVWMO memory consistency -model only if there exists a global memory order conforming to preserved -program order and satisfying the load value axiom, the atomicity -axiom, and the progress axiom.

-
-
-
Load Value Axiom
-
-

Each byte of each load i returns the value written to that -byte by the store that is the latest in global memory order among the -following stores:

-
-
-
    -
  1. -

    Stores that write that byte and that precede i in the -global memory order

    -
  2. -
  3. -

    Stores that write that byte and that precede i in -program order

    -
  4. -
-
-
-
-
Atomicity Axiom
-
-

If r and w are paired load and store -operations generated by aligned LR and SC instructions in a hart -h, s is a store to byte x, and -r returns a value written by s, then -s must precede w in the global memory order, -and there can be no store from a hart other than h to byte -x following s and preceding w -in the global memory order.

-
-
- - - - - -
- - -
-

The Atomicity Axiom theoretically supports LR/SC pairs of different widths and to -mismatched addresses, since implementations are permitted to allow SC -operations to succeed in such cases. However, in practice, we expect -such patterns to be rare, and their use is discouraged.

-
-
-
-
-
-
Progress Axiom
-
-

No memory operation may be preceded in the global memory order by an -infinite sequence of other memory operations.

-
-
-
-
-
-

18.2. CSR Dependency Tracking Granularity

- - ----- - - - - - - - - - - - - - - - - - - - - - - - - -
Table 9. Granularities at which syntactic dependencies are tracked through CSRs
Name | Portions Tracked as Independent Units | Aliases

fflags

Bits 4, 3, 2, 1, 0

fcsr

frm

entire CSR

fcsr

fcsr

Bits 7-5, 4, 3, 2, 1, 0

fflags, frm

-
-

Note: read-only CSRs are not listed, as they do not participate in the -definition of syntactic dependencies.

-
-
-
-

18.3. Source and Destination Register Listings

-
-

This section provides a concrete listing of the source and destination -registers for each instruction. These listings are used in the -definition of syntactic dependencies in -Section 18.1.2.

-
-
-

The term "accumulating CSR" is used to describe a CSR that is both a -source and a destination register, but which carries a dependency only -from itself to itself.

-
-
-

Instructions carry a dependency from each source register in the -"Source Registers" column to each destination register in the -"Destination Registers" column, from each source register in the -"Source Registers" column to each CSR in the "Accumulating CSRs" -column, and from each CSR in the "Accumulating CSRs" column to itself, -except where annotated otherwise.

-
-
-

Key:

-
-
-
    -
  • -

    A: Address source register

    -
  • -
  • -

    D: Data source register

    -
  • -
  • -

    † The instruction does not carry a dependency from -any source register to any destination register

    -
  • -
  • -

    ‡ The instruction carries dependencies from source -register(s) to destination register(s) as specified

    -
  • -
-
- - ------- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
Table 10. RV32I Base Integer Instruction Set
Source Registers | Destination Registers | Accumulating CSRs

LUI

rd

AUIPC

rd

JAL

rd

JALR†

rs1

rd

BEQ

rs1, rs2

BNE

rs1, rs2

BLT

rs1, rs2

BGE

rs1, rs2

BLTU

rs1, rs2

BGEU

rs1, rs2

LB †

rs1 A

rd

LH †

rs1 A

rd

LW †

rs1 A

rd

LBU †

rs1 A

rd

LHU †

rs1 A

rd

SB

rs1 A, rs2 D

SH

rs1 A, rs2 D

SW

rs1 A, rs2 D

ADDI

rs1

rd

SLTI

rs1

rd

SLTIU

rs1

rd

XORI

rs1

rd

ORI

rs1

rd

ANDI

rs1

rd

SLLI

rs1

rd

SRLI

rs1

rd

SRAI

rs1

rd

ADD

rs1, rs2

rd

SUB

rs1, rs2

rd

SLL

rs1, rs2

rd

SLT

rs1, rs2

rd

SLTU

rs1, rs2

rd

XOR

rs1, rs2

rd

SRL

rs1, rs2

rd

SRA

rs1, rs2

rd

OR

rs1, rs2

rd

AND

rs1, rs2

rd

FENCE

FENCE.I

ECALL

EBREAK

CSRRW‡

rs1, csr*

rd, csr

*unless rd=x0

CSRRS‡

rs1, csr

rd *, csr

*unless rs1=x0

CSRRC‡

rs1, csr

rd *, csr

*unless rs1=x0

‡ carries a dependency from rs1 to csr and from csr to rd

CSRRWI ‡

csr *

rd, csr

*unless rd=x0

CSRRSI ‡

csr

rd, csr*

*unless uimm[4:0]=0

CSRRCI ‡

csr

rd, csr*

*unless uimm[4:0]=0

‡ carries a dependency from csr to rd

- - ------- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
Table 11. RV64I Base Integer Instruction Set
Source Registers | Destination Registers | Accumulating CSRs

LWU

rs1 A

rd

LD

rs1 A

rd

SD

rs1 A, rs2 D

SLLI

rs1

rd

SRLI

rs1

rd

SRAI

rs1

rd

ADDIW

rs1

rd

SLLIW

rs1

rd

SRLIW

rs1

rd

SRAIW

rs1

rd

ADDW

rs1, rs2

rd

SUBW

rs1, rs2

rd

SLLW

rs1, rs2

rd

SRLW

rs1, rs2

rd

SRAW

rs1, rs2

rd

- - ------- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
Table 12. RV32M Standard Extension
Source Registers | Destination Registers | Accumulating CSRs

MUL

rs1, rs2

rd

MULH

rs1, rs2

rd

MULHSU

rs1, rs2

rd

MULHU

rs1, rs2

rd

DIV

rs1, rs2

rd

DIVU

rs1, rs2

rd

REM

rs1, rs2

rd

REMU

rs1, rs2

rd

- - ------- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
Table 13. RV64M Standard Extension
Source Registers | Destination Registers | Accumulating CSRs

MULW

rs1, rs2

rd

DIVW

rs1, rs2

rd

DIVUW

rs1, rs2

rd

REMW

rs1, rs2

rd

REMUW

rs1, rs2

rd

- - ------- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
Table 14. RV32A Standard Extension
Source Registers | Destination Registers | Accumulating CSRs

LR.W†

rs1 A

rd

SC.W†

rs1 A, rs2 D

rd *

* if successful

AMOSWAP.W†

rs1 A, rs2 D

rd

AMOADD.W†

rs1 A, rs2 D

rd

AMOXOR.W†

rs1 A, rs2 D

rd

AMOAND.W†

rs1 A, rs2 D

rd

AMOOR.W†

rs1 A, rs2 D

rd

AMOMIN.W†

rs1 A, rs2 D

rd

AMOMAX.W†

rs1 A, rs2 D

rd

AMOMINU.W†

rs1 A, rs2 D

rd

AMOMAXU.W†

rs1 A, rs2 D

rd

- - ------- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
Table 15. RV64A Standard Extension
Source Registers | Destination Registers | Accumulating CSRs

LR.D†

rs1 A

rd

SC.D†

rs1 A, rs2 D

rd *

*if successful

AMOSWAP.D†

rs1 A, rs2 D

rd

AMOADD.D†

rs1 A, rs2 D

rd

AMOXOR.D†

rs1 A, rs2 D

rd

AMOAND.D†

rs1 A, rs2 D

rd

AMOOR.D†

rs1 A, rs2 D

rd

AMOMIN.D†

rs1 A, rs2 D

rd

AMOMAX.D†

rs1 A, rs2 D

rd

AMOMINU.D†

rs1 A, rs2 D

rd

AMOMAXU.D†

rs1 A, rs2 D

rd

- - ------- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
Table 16. RV32F Standard Extension
Source Registers | Destination Registers | Accumulating CSRs

FLW†

rs1 A

rd

FSW

rs1 A, rs2 D

FMADD.S

rs1, rs2, rs3, frm*

rd

NV, OF, UF, NX

*if rm=111

FMSUB.S

rs1, rs2, rs3, frm*

rd

NV, OF, UF, NX

*if rm=111

FNMSUB.S

rs1, rs2, rs3, frm*

rd

NV, OF, UF, NX

*if rm=111

FNMADD.S

rs1, rs2, rs3, frm*

rd

NV, OF, UF, NX

*if rm=111

FADD.S

rs1, rs2, frm*

rd

NV, OF, NX

*if rm=111

FSUB.S

rs1, rs2, frm*

rd

NV, OF, NX

*if rm=111

FMUL.S

rs1, rs2, frm*

rd

NV, OF, UF, NX

*if rm=111

FDIV.S

rs1, rs2, frm*

rd

NV, DZ, OF, UF, NX

*if rm=111

FSQRT.S

rs1, frm*

rd

NV, NX

*if rm=111

FSGNJ.S

rs1, rs2

rd

FSGNJN.S

rs1, rs2

rd

FSGNJX.S

rs1, rs2

rd

FMIN.S

rs1, rs2

rd

NV

FMAX.S

rs1, rs2

rd

NV

FCVT.W.S

rs1, frm*

rd

NV, NX

*if rm=111

FCVT.WU.S

rs1, frm*

rd

NV, NX

*if rm=111

FMV.X.W

rs1

rd

FEQ.S

rs1, rs2

rd

NV

FLT.S

rs1, rs2

rd

NV

FLE.S

rs1, rs2

rd

NV

FCLASS.S

rs1

rd

FCVT.S.W

rs1, frm*

rd

NX

*if rm=111

FCVT.S.WU

rs1, frm*

rd

NX

*if rm=111

FMV.W.X

rs1

rd

- - ------- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
Table 17. RV64F Standard Extension
Source Registers | Destination Registers | Accumulating CSRs

FCVT.L.S

rs1, frm*

rd

NV, NX

*if rm=111

FCVT.LU.S

rs1, frm*

rd

NV, NX

*if rm=111

FCVT.S.L

rs1, frm*

rd

NX

*if rm=111

FCVT.S.LU

rs1, frm*

rd

NX

*if rm=111

- - ------- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
Table 18. RV32D Standard Extension
Source Registers | Destination Registers | Accumulating CSRs

FLD†

rs1 A

rd

FSD

rs1 A, rs2 D

FMADD.D

rs1, rs2, rs3, frm*

rd

NV, OF, UF, NX

*if rm=111

FMSUB.D

rs1, rs2, rs3, frm*

rd

NV, OF, UF, NX

*if rm=111

FNMSUB.D

rs1, rs2, rs3, frm*

rd

NV, OF, UF, NX

*if rm=111

FNMADD.D

rs1, rs2, rs3, frm*

rd

NV, OF, UF, NX

*if rm=111

FADD.D

rs1, rs2, frm*

rd

NV, OF, NX

*if rm=111

FSUB.D

rs1, rs2, frm*

rd

NV, OF, NX

*if rm=111

FMUL.D

rs1, rs2, frm*

rd

NV, OF, UF, NX

*if rm=111

FDIV.D

rs1, rs2, frm*

rd

NV, DZ, OF, UF, NX

*if rm=111

FSQRT.D

rs1, frm*

rd

NV, NX

*if rm=111

FSGNJ.D

rs1, rs2

rd

FSGNJN.D

rs1, rs2

rd

FSGNJX.D

rs1, rs2

rd

FMIN.D

rs1, rs2

rd

NV

FMAX.D

rs1, rs2

rd

NV

FCVT.S.D

rs1, frm*

rd

NV, OF, UF, NX

*if rm=111

FCVT.D.S

rs1

rd

NV

FEQ.D

rs1, rs2

rd

NV

FLT.D

rs1, rs2

rd

NV

FLE.D

rs1, rs2

rd

NV

FCLASS.D

rs1

rd

FCVT.W.D

rs1, frm*

rd

NV, NX

*if rm=111

FCVT.WU.D

rs1, frm*

rd

NV, NX

*if rm=111

FCVT.D.W

rs1

rd

FCVT.D.WU

rs1

rd

- - ------- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
Table 19. RV64D Standard Extension
Source Registers | Destination Registers | Accumulating CSRs

FCVT.L.D

rs1, frm*

rd

NV, NX

*if rm=111

FCVT.LU.D

rs1, frm*

rd

NV, NX

*if rm=111

FMV.X.D

rs1

rd

FCVT.D.L

rs1, frm*

rd

NX

*if rm=111

FCVT.D.LU

rs1, frm*

rd

NX

*if rm=111

FMV.D.X

rs1

rd

-
-
-
-
-

19. "Ztso" Extension for Total Store Ordering, Version 1.0

-
-
-

CV32A65X: This extension is not supported.

-
-
-
-
-

20. "CMO" Extensions for Base Cache Management Operation ISA, Version 1.0.0

-
-
-

CV32A65X: These extensions are not supported.

-
-
-
-
-

21. "F" Extension for Single-Precision Floating-Point, Version 2.2

-
-
-

CV32A65X: This extension is not supported.

-
-
-
-
-

22. "D" Extension for Double-Precision Floating-Point, Version 2.2

-
-
-

CV32A65X: This extension is not supported.

-
-
-
-
-

23. "Q" Extension for Quad-Precision Floating-Point, Version 2.2

-
-
-

CV32A65X: This extension is not supported.

-
-
-
-
-

24. "Zfh" and "Zfhmin" Extensions for Half-Precision Floating-Point, Version 1.0

-
-
-

CV32A65X: These extensions are not supported.

-
-
-
-
-

25. "BF16" Extensions for BFloat16-precision Floating-Point, Version 1.0

-
-
-

CV32A65X: These extensions are not supported.

-
-
-
-
-

26. "Zfa" Extension for Additional Floating-Point Instructions, Version 1.0

-
-
-

CV32A65X: This extension is not supported.

-
-
-
-
-

27. "Zfinx", "Zdinx", "Zhinx", "Zhinxmin" Extensions for Floating-Point in Integer Registers, Version 1.0

-
-
-

CV32A65X: These extensions are not supported.

-
-
-
-
-

28. "C" Extension for Compressed Instructions, Version 2.0

-
-
-

This chapter describes the RISC-V standard compressed instruction-set -extension, named "C", which reduces static and dynamic code size by -adding short 16-bit instruction encodings for common operations. The C -extension can be added to any of the base ISAs (RV32, RV64, RV128), and -we use the generic term "RVC" to cover any of these. Typically, -50%-60% of the RISC-V instructions in a program can be replaced with RVC -instructions, resulting in a 25%-30% code-size reduction.

-
-
-

28.1. Overview

-
-

RVC uses a simple compression scheme that offers shorter 16-bit versions -of common 32-bit RISC-V instructions when:

-
-
-
    -
  • -

    the immediate or address offset is small, or

    -
  • -
  • -

    one of the registers is the zero register (x0), the ABI link register -(x1), or the ABI stack pointer (x2), or

    -
  • -
  • -

    the destination register and the first source register are identical, or

    -
  • -
  • -

    the registers used are the 8 most popular ones.

    -
  • -
-
-
-

The C extension is compatible with all other standard instruction -extensions. The C extension allows 16-bit instructions to be freely -intermixed with 32-bit instructions, with the latter now able to start -on any 16-bit boundary, i.e., IALIGN=16. With the addition of the C -extension, no instructions can raise instruction-address-misaligned -exceptions.

-
-
- - - - - -
- - -
-

Removing the 32-bit alignment constraint on the original 32-bit -instructions allows significantly greater code density.

-
-
-
-
-

The compressed instruction encodings are mostly common across RV32C, -RV64C, and RV128C, but as shown in Table 34, a few opcodes are used for -different purposes depending on base ISA. For example, the wider -address-space RV64C and RV128C variants require additional opcodes to -compress loads and stores of 64-bit integer values, while RV32C uses the -same opcodes to compress loads and stores of single-precision -floating-point values. Similarly, RV128C requires additional opcodes to -capture loads and stores of 128-bit integer values, while these same -opcodes are used for loads and stores of double-precision floating-point -values in RV32C and RV64C. If the C extension is implemented, the -appropriate compressed floating-point load and store instructions must -be provided whenever the relevant standard floating-point extension (F -and/or D) is also implemented. In addition, RV32C includes a compressed -jump and link instruction to compress short-range subroutine calls, -where the same opcode is used to compress ADDIW for RV64C and RV128C.

-
-
- - - - - -
- - -
-

Double-precision loads and stores are a significant fraction of static -and dynamic instructions, hence the motivation to include them in the -RV32C and RV64C encoding.

-
-
-

Although single-precision loads and stores are not a significant source -of static or dynamic compression for benchmarks compiled for the -currently supported ABIs, for microcontrollers that only provide -hardware single-precision floating-point units and have an ABI that only -supports single-precision floating-point numbers, the single-precision -loads and stores will be used at least as frequently as double-precision -loads and stores in the measured benchmarks. Hence, the motivation to -provide compressed support for these in RV32C.

-
-
-

Short-range subroutine calls are more likely in small binaries for -microcontrollers, hence the motivation to include these in RV32C.

-
-
-

Although reusing opcodes for different purposes for different base ISAs -adds some complexity to documentation, the impact on implementation -complexity is small even for designs that support multiple base ISAs. -The compressed floating-point load and store variants use the same -instruction format with the same register specifiers as the wider -integer loads and stores.

-
-
-
-
-

RVC was designed under the constraint that each RVC instruction expands -into a single 32-bit instruction in either the base ISA (RV32I/E, RV64I/E, -or RV128I) or the F and D standard extensions where present. Adopting -this constraint has two main benefits:

-
-
-
    -
  • -

    Hardware designs can simply expand RVC instructions during decode, -simplifying verification and minimizing modifications to existing -microarchitectures.

    -
  • -
  • -

    Compilers can be unaware of the RVC extension and leave code compression -to the assembler and linker, although a compression-aware compiler will -generally be able to produce better results.

    -
  • -
-
-
- - - - - -
- - -
-

We felt the multiple complexity reductions of a simple one-one mapping -between C and base IFD instructions far outweighed the potential gains -of a slightly denser encoding that added additional instructions only -supported in the C extension, or that allowed encoding of multiple IFD -instructions in one C instruction.

-
-
-
-
-

It is important to note that the C extension is not designed to be a -stand-alone ISA, and is meant to be used alongside a base ISA.

-
-
- - - - - -
- - -
-

Variable-length instruction sets have long been used to improve code -density. For example, the IBM Stretch (Buchholz, 1962), developed in the late 1950s, had -an ISA with 32-bit and 64-bit instructions, where some of the 32-bit -instructions were compressed versions of the full 64-bit instructions. -Stretch also employed the concept of limiting the set of registers that -were addressable in some of the shorter instruction formats, with short -branch instructions that could only refer to one of the index registers. -The later IBM 360 architecture (Amdahl et al., 1964) supported a simple variable-length -instruction encoding with 16-bit, 32-bit, or 48-bit instruction formats.

-
-
-

In 1963, CDC introduced the Cray-designed CDC 6600 (Thornton, 1965), a precursor to RISC -architectures, that introduced a register-rich load-store architecture -with instructions of two lengths, 15-bits and 30-bits. The later Cray-1 -design used a very similar instruction format, with 16-bit and 32-bit -instruction lengths.

-
-
-

The initial RISC ISAs from the 1980s all picked performance over code -size, which was reasonable for a workstation environment, but not for -embedded systems. Hence, both ARM and MIPS subsequently made versions of -the ISAs that offered smaller code size by offering an alternative -16-bit wide instruction set instead of the standard 32-bit wide -instructions. The compressed RISC ISAs reduced code size relative to -their starting points by about 25-30%, yielding code that was -significantly smaller than 80x86. This result surprised some, as their -intuition was that the variable-length CISC ISA should be smaller than -RISC ISAs that offered only 16-bit and 32-bit formats.

-
-
-

Since the original RISC ISAs did not leave sufficient opcode space free -to include these unplanned compressed instructions, they were instead -developed as complete new ISAs. This meant compilers needed different -code generators for the separate compressed ISAs. The first compressed -RISC ISA extensions (e.g., ARM Thumb and MIPS16) used only a fixed -16-bit instruction size, which gave good reductions in static code size -but caused an increase in dynamic instruction count, which led to lower -performance compared to the original fixed-width 32-bit instruction -size. This led to the development of a second generation of compressed -RISC ISA designs with mixed 16-bit and 32-bit instruction lengths (e.g., -ARM Thumb2, microMIPS, PowerPC VLE), so that performance was similar to -pure 32-bit instructions but with significant code size savings. -Unfortunately, these different generations of compressed ISAs are -incompatible with each other and with the original uncompressed ISA, -leading to significant complexity in documentation, implementations, and -software tools support.

-
-
-

Of the commonly used 64-bit ISAs, only PowerPC and microMIPS currently -support a compressed instruction format. It is surprising that the most -popular 64-bit ISA for mobile platforms (ARM v8) does not include a -compressed instruction format given that static code size and dynamic -instruction fetch bandwidth are important metrics. Although static code -size is not a major concern in larger systems, instruction fetch -bandwidth can be a major bottleneck in servers running commercial -workloads, which often have a large instruction working set.

-
-
-

Benefiting from 25 years of hindsight, RISC-V was designed to support -compressed instructions from the outset, leaving enough opcode space for -RVC to be added as a simple extension on top of the base ISA (along with -many other extensions). The philosophy of RVC is to reduce code size for -embedded applications and to improve performance and energy-efficiency -for all applications due to fewer misses in the instruction cache. -Waterman shows that RVC fetches 25%-30% fewer instruction bits, which -reduces instruction cache misses by 20%-25%, or roughly the same -performance impact as doubling the instruction cache size. (Waterman, 2011)

-
-
-
-
-
-

28.2. Compressed Instruction Formats

-
-

-
-
-

Table 20 shows the nine compressed instruction -formats. CR, CI, and CSS can use any of the 32 RVI registers, but CIW, -CL, CS, CA, and CB are limited to just 8 of them. -Table 21 lists these popular registers, which -correspond to registers x8 to x15. Note that there is a separate -version of load and store instructions that use the stack pointer as the -base address register, since saving to and restoring from the stack are -so prevalent, and that they use the CI and CSS formats to allow access -to all 32 data registers. CIW supplies an 8-bit immediate for the -ADDI4SPN instruction.

-
-
- - - - - -
- - -
-

The RISC-V ABI was changed to make the frequently used registers map to -registers 'x8-x15'. This simplifies the decompression decoder by -having a contiguous naturally aligned set of register numbers, and is -also compatible with the RV32E and RV64E base ISAs, which only have 16 integer -registers.

-
-
-
-
-

Compressed register-based floating-point loads and stores also use the -CL and CS formats respectively, with the eight registers mapping to f8 to f15. -

-
-
- - - - - -
- - -
-

The standard RISC-V calling convention maps the most frequently used -floating-point registers to registers f8 to f15, which allows the -same register decompression decoding as for integer register numbers.

-
-
-
-
-

-The formats were designed to keep bits for the two register source -specifiers in the same place in all instructions, while the destination -register field can move. When the full 5-bit destination register -specifier is present, it is in the same place as in the 32-bit RISC-V -encoding. Where immediates are sign-extended, the sign extension is -always from bit 12. Immediate fields have been scrambled, as in the base -specification, to reduce the number of immediate muxes required.

-
-
- - - - - -
- - -
-

The immediate fields are scrambled in the instruction formats instead of -in sequential order so that as many bits as possible are in the same -position in every instruction, thereby simplifying implementations.

-
-
-
-
-

For many RVC instructions, zero-valued immediates are disallowed and -x0 is not a valid 5-bit register specifier. These restrictions free up -encoding space for other instructions requiring fewer operand bits.

-
- - ---- - - - - - - -
Table 20. Compressed 16-bit RVC instruction formats
---- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -

Format

Meaning

CR

Register

CI

Immediate

CSS

Stack-relative Store

CIW

Wide Immediate

CL

Load

CS

Store

CA

Arithmetic

CB

Branch/Arithmetic

CJ

Jump

--------- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -

15 14 13 12

11 10 9 8 7

6 5 4 3 2

1 0

funct4

rd/rs1

rs2

op

funct3

imm

rd/rs1

imm

op

funct3

imm

rs2

op

funct3

imm

rd′

op

funct3

imm

rs1′

imm

rd′

op

funct3

imm

rs1′

imm

rs2′

op

funct6

rd′/rs1′

funct2

rs2′

op

funct3

offset

rd′/rs1′

offset

op

funct3

jump target

op

- - ---- - - - - - - -
Table 21. Registers specified by the three-bit rs1′, rs2′, and rd′ fields of the CIW, CL, CS, CA, and CB formats.
--- - - - - - - - - - - - - - - - - - -

RVC Register Number

Integer Register Number

Integer Register ABI Name

Floating-Point Register Number

Floating-Point Register ABI Name

---------- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -

000

001

010

011

100

101

110

111

x8

x9

x10

x11

x12

x13

x14

x15

s0

s1

a0

a1

a2

a3

a4

a5

f8

f9

f10

f11

f12

f13

f14

f15

fs0

fs1

fa0

fa1

fa2

fa3

fa4

fa5

-
-
-

28.3. Load and Store Instructions

-
-

To increase the reach of 16-bit instructions, data-transfer instructions -use zero-extended immediates that are scaled by the size of the data in -bytes: ×4 for words, ×8 for double -words, and ×16 for quad words.

-
-
-

RVC provides two variants of loads and stores. One uses the ABI stack -pointer, x2, as the base address and can target any data register. The -other can reference one of 8 base address registers and one of 8 data -registers.

-
-
-

28.3.1. Stack-Pointer-Based Loads and Stores

-
-
-Diagram -
-
-
-

These instructions use the CI format.

-
-
-

C.LWSP loads a 32-bit value from memory into register rd. It computes -an effective address by adding the zero-extended offset, scaled by 4, -to the stack pointer, x2. It expands to lw rd, offset(x2). C.LWSP is -only valid when rd≠x0; the code points with rd=x0 are reserved.

-
-
-

C.LDSP is an RV64C/RV128C-only instruction that loads a 64-bit value -from memory into register rd. It computes its effective address by -adding the zero-extended offset, scaled by 8, to the stack pointer, -x2. It expands to ld rd, offset(x2). C.LDSP is only valid when -rd≠x0; the code points with -rd=x0 are reserved.

-
-
-

C.LQSP is an RV128C-only instruction that loads a 128-bit value from -memory into register rd. It computes its effective address by adding -the zero-extended offset, scaled by 16, to the stack pointer, x2. It -expands to lq rd, offset(x2). C.LQSP is only valid when -rd≠x0; the code points with -rd=x0 are reserved.

-
-
-

C.FLWSP is an RV32FC-only instruction that loads a single-precision -floating-point value from memory into floating-point register rd. It -computes its effective address by adding the zero-extended offset, -scaled by 4, to the stack pointer, x2. It expands to -flw rd, offset(x2).

-
-
-

C.FLDSP is an RV32DC/RV64DC-only instruction that loads a -double-precision floating-point value from memory into floating-point -register rd. It computes its effective address by adding the -zero-extended offset, scaled by 8, to the stack pointer, x2. It -expands to fld rd, offset(x2).

-
-
-
-Diagram -
-
-
-

These instructions use the CSS format.

-
-
-

C.SWSP stores a 32-bit value in register rs2 to memory. It computes an -effective address by adding the zero-extended offset, scaled by 4, to -the stack pointer, x2. It expands to sw rs2, offset(x2).

-
-
-

C.SDSP is an RV64C/RV128C-only instruction that stores a 64-bit value in -register rs2 to memory. It computes an effective address by adding the -zero-extended offset, scaled by 8, to the stack pointer, x2. It -expands to sd rs2, offset(x2).

-
-
-

C.SQSP is an RV128C-only instruction that stores a 128-bit value in -register rs2 to memory. It computes an effective address by adding the -zero-extended offset, scaled by 16, to the stack pointer, x2. It -expands to sq rs2, offset(x2).

-
-
-

C.FSWSP is an RV32FC-only instruction that stores a single-precision -floating-point value in floating-point register rs2 to memory. It -computes an effective address by adding the zero-extended offset, -scaled by 4, to the stack pointer, x2. It expands to -fsw rs2, offset(x2).

-
-
-

C.FSDSP is an RV32DC/RV64DC-only instruction that stores a -double-precision floating-point value in floating-point register rs2 -to memory. It computes an effective address by adding the -zero-extended offset, scaled by 8, to the stack pointer, x2. It -expands to fsd rs2, offset(x2).

-
-
- - - - - -
- - -
-

Register save/restore code at function entry/exit represents a -significant portion of static code size. The stack-pointer-based -compressed loads and stores in RVC are effective at reducing the -save/restore static code size by a factor of 2 while improving -performance by reducing dynamic instruction bandwidth.

-
-
-

A common mechanism used in other ISAs to further reduce save/restore -code size is load-multiple and store-multiple instructions. We -considered adopting these for RISC-V but noted the following drawbacks -to these instructions:

-
-
-
    -
  • -

    These instructions complicate processor implementations.

    -
  • -
  • -

    For virtual memory systems, some data accesses could be resident in -physical memory and some could not, which requires a new restart -mechanism for partially executed instructions.

    -
  • -
  • -

    Unlike the rest of the RVC instructions, there is no IFD equivalent to -Load Multiple and Store Multiple.

    -
  • -
  • -

    Unlike the rest of the RVC instructions, the compiler would have to be -aware of these instructions to both generate the instructions and to -allocate registers in an order to maximize the chances of them being -saved and restored, since they would be saved and restored in sequential -order.

    -
  • -
  • -

    Simple microarchitectural implementations will constrain how other -instructions can be scheduled around the load and store multiple -instructions, leading to a potential performance loss.

    -
  • -
  • -

    The desire for sequential register allocation might conflict with the -featured registers selected for the CIW, CL, CS, CA, and CB formats.

    -
  • -
-
-
-

Furthermore, much of the gains can be realized in software by replacing -prologue and epilogue code with subroutine calls to common prologue and -epilogue code, a technique described in Section 5.6 of (Waterman, 2016).

-
-
-

While reasonable architects might come to different conclusions, we -decided to omit load and store multiple and instead use the -software-only approach of calling save/restore millicode routines to -attain the greatest code size reduction.

-
-
-
-
-
-

28.3.2. Register-Based Loads and Stores

-
-
-Diagram -
-
-
-

-These instructions use the CL format.

-
-
-

C.LW loads a 32-bit value from memory into register -rd′. It computes an effective address by adding the -zero-extended offset, scaled by 4, to the base address in register -rs1′. It expands to lw rd′, offset(rs1′).

-
-
-

C.LD is an RV64C/RV128C-only instruction that loads a 64-bit value from -memory into register rd′. It computes an effective -address by adding the zero-extended offset, scaled by 8, to the base -address in register rs1′. It expands to -ld rd′, offset(rs1′).

-
-
-

C.LQ is an RV128C-only instruction that loads a 128-bit value from -memory into register rd′. It computes an effective -address by adding the zero-extended offset, scaled by 16, to the base -address in register rs1′. It expands to -lq rd′, offset(rs1′).

-
-
-

C.FLW is an RV32FC-only instruction that loads a single-precision -floating-point value from memory into floating-point register -rd′. It computes an effective address by adding the -zero-extended offset, scaled by 4, to the base address in register -rs1′. It expands to -flw rd′, offset(rs1′).

-
-
-

C.FLD is an RV32DC/RV64DC-only instruction that loads a double-precision -floating-point value from memory into floating-point register -rd′. It computes an effective address by adding the -zero-extended offset, scaled by 8, to the base address in register -rs1′. It expands to -fld rd′, offset(rs1′).

-
-
-
-Diagram -
-
-
-

-
-
-

These instructions use the CS format.

-
-
-

C.SW stores a 32-bit value in register rs2′ to memory. -It computes an effective address by adding the zero-extended offset, -scaled by 4, to the base address in register rs1′. It -expands to sw rs2′, offset(rs1′).

-
-
-

C.SD is an RV64C/RV128C-only instruction that stores a 64-bit value in -register rs2′ to memory. It computes an effective -address by adding the zero-extended offset, scaled by 8, to the base -address in register rs1′. It expands to -sd rs2′, offset(rs1′).

-
-
-

C.SQ is an RV128C-only instruction that stores a 128-bit value in -register rs2′ to memory. It computes an effective -address by adding the zero-extended offset, scaled by 16, to the base -address in register rs1′. It expands to -sq rs2′, offset(rs1′).

-
-
-

C.FSW is an RV32FC-only instruction that stores a single-precision -floating-point value in floating-point register rs2′ to -memory. It computes an effective address by adding the zero-extended -offset, scaled by 4, to the base address in register -rs1′. It expands to -fsw rs2′, offset(rs1′).

-
-
-

C.FSD is an RV32DC/RV64DC-only instruction that stores a -double-precision floating-point value in floating-point register -rs2′ to memory. It computes an effective address by -adding the zero-extended offset, scaled by 8, to the base address in -register rs1′. It expands to -fsd rs2′, offset(rs1′).

-
-
-
-
-

28.4. Control Transfer Instructions

-
-

RVC provides unconditional jump instructions and conditional branch -instructions. As with base RVI instructions, the offsets of all RVC -control transfer instructions are in multiples of 2 bytes.

-
-
-
-Diagram -
-
-
-

-
-
-

These instructions use the CJ format.

-
-
-

C.J performs an unconditional control transfer. The offset is -sign-extended and added to the pc to form the jump target address. C.J -can therefore target a ±2 KiB range. C.J expands to -jal x0, offset.

-
-
-

C.JAL is an RV32C-only instruction that performs the same operation as -C.J, but additionally writes the address of the instruction following -the jump (pc+2) to the link register, x1. C.JAL expands to -jal x1, offset.

-
-
-
-Diagram -
-
-
-

-
-
-

These instructions use the CR format.

-
-
-

C.JR (jump register) performs an unconditional control transfer to the -address in register rs1. C.JR expands to jalr x0, 0(rs1). C.JR is -only valid when rs1≠x0; the code -point with rs1=x0 is reserved.

-
-
-

C.JALR (jump and link register) performs the same operation as C.JR, but -additionally writes the address of the instruction following the jump -(pc+2) to the link register, x1. C.JALR expands to -jalr x1, 0(rs1). C.JALR is only valid when -rs1≠x0; the code point with -rs1=x0 corresponds to the C.EBREAK -instruction.

-
-
- - - - - -
- - -
-

Strictly speaking, C.JALR does not expand exactly to a base RVI -instruction as the value added to the PC to form the link address is 2 -rather than 4 as in the base ISA, but supporting both offsets of 2 and 4 -bytes is only a very minor change to the base microarchitecture.

-
-
-
-
-
-Diagram -
-
-
-

-
-
-

These instructions use the CB format.

-
-
-

C.BEQZ performs conditional control transfers. The offset is -sign-extended and added to the pc to form the branch target address. -It can therefore target a ±256 B range. C.BEQZ takes the -branch if the value in register rs1′ is zero. It -expands to beq rs1′, x0, offset.

-
-
-

C.BNEZ is defined analogously, but it takes the branch if -rs1′ contains a nonzero value. It expands to -bne rs1′, x0, offset.

-
-
-
-

28.5. Integer Computational Instructions

-
-

RVC provides several instructions for integer arithmetic and constant -generation.

-
-
-

28.5.1. Integer Constant-Generation Instructions

-
-

The two constant-generation instructions both use the CI instruction -format and can target any integer register.

-
-
-
-Diagram -
-
-
-

-
-
-

C.LI loads the sign-extended 6-bit immediate, imm, into register rd. -C.LI expands into addi rd, x0, imm. C.LI is only valid when -rd≠x0; the code points with rd=x0 encode HINTs.

-
-
-

C.LUI loads the non-zero 6-bit immediate field into bits 17–12 of the -destination register, clears the bottom 12 bits, and sign-extends bit 17 -into all higher bits of the destination. C.LUI expands into -lui rd, imm. C.LUI is only valid when -rd≠{x0, x2}, -and when the immediate is not equal to zero. The code points with -imm=0 are reserved; the remaining code points with rd=x0 are -HINTs; and the remaining code points with rd=x2 correspond to the -C.ADDI16SP instruction.

-
-
-
-

28.5.2. Integer Register-Immediate Operations

-
-

These integer register-immediate operations are encoded in the CI format -and perform operations on an integer register and a 6-bit immediate.

-
-
-
-Diagram -
-
-
-

-
-
-

C.ADDI adds the non-zero sign-extended 6-bit immediate to the value in -register rd then writes the result to rd. C.ADDI expands into -addi rd, rd, imm. C.ADDI is only valid when -rd≠x0 and imm≠0. The code -points with rd=x0 encode the C.NOP instruction; the remaining code -points with imm=0 encode HINTs.

-
-
-

C.ADDIW is an RV64C/RV128C-only instruction that performs the same -computation but produces a 32-bit result, then sign-extends the result to 64 -bits. C.ADDIW expands into addiw rd, rd, imm. The immediate can be -zero for C.ADDIW, where this corresponds to sext.w rd. C.ADDIW is -only valid when rd≠x0; the code points with -rd=x0 are reserved.

-
-
-

C.ADDI16SP shares the opcode with C.LUI, but has a destination field of -x2. C.ADDI16SP adds the non-zero sign-extended 6-bit immediate to the -value in the stack pointer (sp=x2), where the immediate is scaled to -represent multiples of 16 in the range (-512,496). C.ADDI16SP is used to -adjust the stack pointer in procedure prologues and epilogues. It -expands into addi x2, x2, nzimm[9:4]. C.ADDI16SP is only valid when -nzimm≠0; the code point with nzimm=0 is reserved.

-
-
- - - - - -
- - -
-

In the standard RISC-V calling convention, the stack pointer sp is -always 16-byte aligned.

-
-
-
-
-
-Diagram -
-
-
-

-C.ADDI4SPN is a CIW-format instruction that adds a zero-extended -non-zero immediate, scaled by 4, to the stack pointer, x2, and writes -the result to rd′. This instruction is used to generate -pointers to stack-allocated variables, and expands to -addi rd′, x2, nzuimm[9:2]. C.ADDI4SPN is only valid when -nzuimm≠0; the code points with nzuimm=0 are -reserved.

-
-
-
-Diagram -
-
-
-

-
-
-

C.SLLI is a CI-format instruction that performs a logical left shift of -the value in register rd then writes the result to rd. The shift -amount is encoded in the shamt field. For RV128C, a shift amount of -zero is used to encode a shift of 64. C.SLLI expands into -slli rd, rd, shamt[5:0], except for RV128C with shamt=0, which expands to -slli rd, rd, 64.

-
-
-

For RV32C, shamt[5] must be zero; the code points with shamt[5]=1 -are designated for custom extensions. For RV32C and RV64C, the shift -amount must be non-zero; the code points with shamt=0 are HINTs. For -all base ISAs, the code points with rd=x0 are HINTs, except those -with shamt[5]=1 in RV32C.

-
-
-
-Diagram -
-
-
-

-
-
-

C.SRLI is a CB-format instruction that performs a logical right shift of -the value in register rd′ then writes the result to -rd′. The shift amount is encoded in the shamt field. -For RV128C, a shift amount of zero is used to encode a shift of 64. -Furthermore, the shift amount is sign-extended for RV128C, and so the -legal shift amounts are 1-31, 64, and 96-127. C.SRLI expands into -srli rd′, rd′, shamt, except for -RV128C with shamt=0, which expands to -srli rd′, rd′, 64.

-
-
-

For RV32C, shamt[5] must be zero; the code points with shamt[5]=1 -are designated for custom extensions. For RV32C and RV64C, the shift -amount must be non-zero; the code points with shamt=0 are HINTs.

-
-
-

C.SRAI is defined analogously to C.SRLI, but instead performs an -arithmetic right shift. C.SRAI expands to -srai rd′, rd′, shamt.

-
-
- - - - - -
- - -
-

Left shifts are usually more frequent than right shifts, as left shifts -are frequently used to scale address values. Right shifts have therefore -been granted less encoding space and are placed in an encoding quadrant -where all other immediates are sign-extended. For RV128, the decision -was made to have the 6-bit shift-amount immediate also be sign-extended. -Apart from reducing the decode complexity, we believe right-shift -amounts of 96-127 will be more useful than 64-95, to allow extraction of -tags located in the high portions of 128-bit address pointers. We note -that RV128C will not be frozen at the same point as RV32C and RV64C, to -allow evaluation of typical usage of 128-bit address-space codes.

-
-
-
-
-
-Diagram -
-
-
-

-
-
-

C.ANDI is a CB-format instruction that computes the bitwise AND of the -value in register rd′ and the sign-extended 6-bit -immediate, then writes the result to rd′. C.ANDI -expands to andi rd′, rd′, imm.

-
-
-
-

28.5.3. Integer Register-Register Operations

-
-
-Diagram -
-
-
-

-These instructions use the CR format.

-
-
-

C.MV copies the value in register rs2 into register rd. C.MV expands -into add rd, x0, rs2. C.MV is only valid when -rs2≠x0; the code points with rs2=x0 correspond to the C.JR instruction. The code points with rs2≠x0 and rd=x0 are HINTs.

-
-
- - - - - -
- - -
-

C.MV expands to a different instruction than the canonical MV -pseudoinstruction, which instead uses ADDI. Implementations that handle -MV specially, e.g. using register-renaming hardware, may find it more -convenient to expand C.MV to MV instead of ADD, at slight additional -hardware cost.

-
-
-
-
-

C.ADD adds the values in registers rd and rs2 and writes the result -to register rd. C.ADD expands into add rd, rd, rs2. C.ADD is only -valid when rs2≠x0; the code points with rs2=x0 correspond to the C.JALR -and C.EBREAK instructions. The code points with rs2≠x0 and rd=x0 are HINTs.

-
-
-
-Diagram -
-
-
-

-
-
-

These instructions use the CA format.

-
-
-

C.AND computes the bitwise AND of the values in registers -rd′ and rs2′, then writes the result -to register rd′. C.AND expands into -and rd′, rd′, rs2′.

-
-
-

C.OR computes the bitwise OR of the values in registers -rd′ and rs2′, then writes the result -to register rd′. C.OR expands into -or rd′, rd′, rs2′.

-
-
-

C.XOR computes the bitwise XOR of the values in registers -rd′ and rs2′, then writes the result -to register rd′. C.XOR expands into -xor rd′, rd′, rs2′.

-
-
-

C.SUB subtracts the value in register rs2′ from the -value in register rd′, then writes the result to -register rd′. C.SUB expands into -sub rd′, rd′, rs2′.

-
-
-

C.ADDW is an RV64C/RV128C-only instruction that adds the values in -registers rd′ and rs2′, then -sign-extends the lower 32 bits of the sum before writing the result to -register rd′. C.ADDW expands into -addw rd′, rd′, rs2′.

-
-
-

C.SUBW is an RV64C/RV128C-only instruction that subtracts the value in -register rs2′ from the value in register -rd′, then sign-extends the lower 32 bits of the -difference before writing the result to register rd′. -C.SUBW expands into subw rd′, rd′, rs2′.

-
-
- - - - - -
- - -
-

This group of six instructions do not provide large savings -individually, but do not occupy much encoding space and are -straightforward to implement, and as a group provide a worthwhile -improvement in static and dynamic compression.

-
-
-
-
-
-

28.5.4. Defined Illegal Instruction

-
-
-Diagram -
-
-
-

-
-
-

A 16-bit instruction with all bits zero is permanently reserved as an -illegal instruction.

-
-
- - - - - -
- - -
-

We reserve all-zero instructions to be illegal instructions to help trap -attempts to execute zero-ed or non-existent portions of the memory -space. The all-zero value should not be redefined in any non-standard -extension. Similarly, we reserve instructions with all bits set to 1 -(corresponding to very long instructions in the RISC-V variable-length -encoding scheme) as illegal to capture another common value seen in -non-existent memory regions.

-
-
-
-
-
-

28.5.5. NOP Instruction

-
-
-Diagram -
-
-
-

-
-
-

C.NOP is a CI-format instruction that does not change any user-visible -state, except for advancing the pc and incrementing any applicable -performance counters. C.NOP expands to nop. C.NOP is only valid when -imm=0; the code points with imm≠0 encode HINTs.

-
-
-
-

28.5.6. Breakpoint Instruction

-
-
-Diagram -
-
-
-

-
-
-

Debuggers can use the C.EBREAK instruction, which expands to ebreak, -to cause control to be transferred back to the debugging environment. -C.EBREAK shares the opcode with the C.ADD instruction, but with rd and -rs2 both zero, thus can also use the CR format.

-
-
-
-
-

28.6. Usage of C Instructions in LR/SC Sequences

-
-

On implementations that support the C extension, compressed forms of the -I instructions permitted inside constrained LR/SC sequences, as -described in [sec:lrscseq], are also permitted -inside constrained LR/SC sequences.

-
-
- - - - - -
- - -
-

The implication is that any implementation that claims to support both -the A and C extensions must ensure that LR/SC sequences containing valid -C instructions will eventually complete.

-
-
-
-
-
-

28.7. HINT Instructions

-
-

A portion of the RVC encoding space is reserved for microarchitectural -HINTs. Like the HINTs in the RV32I base ISA (see -HINT Instructions), these instructions do not -modify any architectural state, except for advancing the pc and any -applicable performance counters. HINTs are executed as no-ops on -implementations that ignore them.

-
-
-

RVC HINTs are encoded as computational instructions that do not modify -the architectural state, either because rd=x0 (e.g. -C.ADD x0, t0), or because rd is overwritten with a copy of itself -(e.g. C.ADDI t0, 0).

-
-
- - - - - -
- - -
-

This HINT encoding has been chosen so that simple implementations can -ignore HINTs altogether, and instead execute a HINT as a regular -computational instruction that happens not to mutate the architectural -state.

-
-
-
-
-

RVC HINTs do not necessarily expand to their RVI HINT counterparts. For -example, C.ADD x0, a0 might not encode the same HINT as -ADD x0, x0, a0.

-
-
- - - - - -
- - -
-

The primary reason to not require an RVC HINT to expand to an RVI HINT -is that HINTs are unlikely to be compressible in the same manner as the -underlying computational instruction. Also, decoupling the RVC and RVI -HINT mappings allows the scarce RVC HINT space to be allocated to the -most popular HINTs, and in particular, to HINTs that are amenable to -macro-op fusion.

-
-
-
-
-

Table 22 lists all RVC HINT code points. For RV32C, 78% -of the HINT space is reserved for standard HINTs. The remainder of the HINT space is designated for custom HINTs; -no standard HINTs will ever be defined in this subspace.

-
- - ------ - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
Table 22. RVC HINT instructions.
InstructionConstraintsCode PointsPurpose

C.NOP

imm≠0

63

Designated for future standard use

C.ADDI

rd≠x0, imm=0

31

C.LI

rd=x0

64

C.LUI

rd=x0, imm≠0

63

C.MV

rd=x0, rs2≠x0

31

C.ADD

rd=x0, rs2≠x0, rs2≠x2-x5

27

C.ADD

rd=x0, rs2=x2-x5

4

(rs2=x2) C.NTL.P1 (rs2=x3) C.NTL.PALL (rs2=x4) C.NTL.S1 (rs2=x5) C.NTL.ALL

C.SLLI

rd=x0, imm≠0

31 (RV32), 63 (RV64/128)

Designated for custom use

C.SLLI64

rd=x0

1

C.SLLI64

rd≠x0, RV32 and RV64 only

31

C.SRLI64

RV32 and RV64 only

8

C.SRAI64

RV32 and RV64 only

8

-
-
-

28.8. RVC Instruction Set Listings

-
-

Table 23 shows a map of the major -opcodes for RVC. Each row of the table corresponds to one quadrant of -the encoding space. The last quadrant, which has the two -least-significant bits set, corresponds to instructions wider than 16 -bits, including those in the base ISAs. Several instructions are only -valid for certain operands; when invalid, they are marked either RES -to indicate that the opcode is reserved for future standard extensions; -Custom to indicate that the opcode is designated for custom -extensions; or HINT to indicate that the opcode is reserved for -microarchitectural hints (see Section 28.7).

-
-
- - ------------- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
Table 23. RVC opcode map instructions.

inst[15:13]
-inst[1:0]

000

001

010

011

100

101

110

111

00

ADDI4SPN

FLD
-FLD
-LQ

LW

FLW
-LD
-LD

Reserved

FSD
-FSD
-SQ

SW

FSW
-SD
-SD

RV32
-RV64
-RV128

01

ADDI

JAL
-ADDIW
-ADDIW

LI

LUI/ADDI16SP

MISC-ALU

J

BEQZ

BNEZ

RV32
-RV64
-RV128

10

SLLI

FLDSP
-FLDSP
-LQSP

LWSP

FLWSP
-LDSP
-LDSP

J[AL]R/MV/ADD

FSDSP
-FSDSP
-SQSP

SWSP

FSWSP
-SDSP
-SDSP

RV32
-RV64
-RV128

11

>16b

-
-

Figure 2, Figure 3, and Figure 4 list the RVC instructions.

-
-
-
-Diagram -
-
Figure 2. Instruction listing for RVC, Quadrant 0
-
-
-
-Diagram -
-
Figure 3. Instruction listing for RVC, Quadrant 1
-
-
-
-Diagram -
-
Figure 4. Instruction listing for RVC, Quadrant 2
-
-
-
-
-
-

29. "Zc*" Extension for Code Size Reduction, Version 1.0.0

-
-
-

29.1. Zc* Overview

-
-

Zc* is a group of extensions that define subsets of the existing C extension (Zca, Zcd, Zcf) and new extensions which only contain 16-bit encodings.

-
-
-

Zcm* all reuse the encodings for c.fld, c.fsd, c.fldsp, c.fsdsp.

-
- - --------- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
Table 24. Zc* extension overview
InstructionZcaZcfZcdZcbZcmpZcmt

The Zca extension is added as a way to refer to instructions in the C extension that do not include the floating-point loads and stores

C excl. c.f*

yes

The Zcf extension is added as a way to refer to compressed single-precision floating-point load/stores

c.flw

rv32

c.flwsp

rv32

c.fsw

rv32

c.fswsp

rv32

The Zcd extension is added as a way to refer to compressed double-precision floating-point load/stores

c.fld

yes

c.fldsp

yes

c.fsd

yes

c.fsdsp

yes

Simple operations for use on all architectures

c.lbu

yes

c.lh

yes

c.lhu

yes

c.sb

yes

c.sh

yes

c.zext.b

yes

c.sext.b

yes

c.zext.h

yes

c.sext.h

yes

c.zext.w

yes

c.mul

yes

c.not

yes

PUSH/POP and double move which overlap with c.fsdsp. Complex operations intended for embedded CPUs

cm.push

yes

cm.pop

yes

cm.popret

yes

cm.popretz

yes

cm.mva01s

yes

cm.mvsa01

yes

Table jump which overlaps with c.fsdsp. Complex operations intended for embedded CPUs

cm.jt

yes

cm.jalt

yes

-
-
-

29.2. C

-
-

The C extension is the superset of the following extensions:

-
-
-
    -
  • -

    Zca

    -
  • -
  • -

    Zcf if F is specified (RV32 only)

    -
  • -
  • -

    Zcd if D is specified

    -
  • -
-
-
-

As C defines the same instructions as Zca, Zcf and Zcd, the rule is that:

-
-
-
    -
  • -

    C always implies Zca

    -
  • -
  • -

    C+F implies Zcf (RV32 only)

    -
  • -
  • -

    C+D implies Zcd

    -
  • -
-
-
-
-

29.3. Zce

-
-

The Zce extension is intended to be used for microcontrollers, and includes all relevant Zc extensions.

-
-
-
    -
  • -

    Specifying Zce on RV32 without F includes Zca, Zcb, Zcmp, Zcmt

    -
  • -
  • -

    Specifying Zce on RV32 with F includes Zca, Zcb, Zcmp, Zcmt and Zcf

    -
  • -
  • -

    Specifying Zce on RV64 always includes Zca, Zcb, Zcmp, Zcmt

    -
    -
      -
    • -

      Zcf doesn’t exist for RV64

      -
    • -
    -
    -
  • -
-
-
-

Therefore common ISA strings can be updated as follows to include the relevant Zc extensions, for example:

-
-
-
    -
  • -

    RV32IMC becomes RV32IM_Zce

    -
  • -
  • -

    RV32IMCF becomes RV32IMF_Zce

    -
  • -
-
-
-
-

29.4. MISA.C

-
-

MISA.C is set if the following extensions are selected:

-
-
-
    -
  • -

    Zca and not F

    -
  • -
  • -

    Zca, Zcf and F is specified (RV32 only)

    -
  • -
  • -

    Zca, Zcf and Zcd if D is specified (RV32 only)

    -
    -
      -
    • -

      this configuration excludes Zcmp, Zcmt

      -
    • -
    -
    -
  • -
  • -

    Zca, Zcd if D is specified (RV64 only)

    -
    -
      -
    • -

      this configuration excludes Zcmp, Zcmt

      -
    • -
    -
    -
  • -
-
-
-
-

29.5. Zca

-
-

The Zca extension is added as a way to refer to instructions in the C extension that do not include the floating-point loads and stores.

-
-
-

Therefore it excludes all 16-bit floating point loads and stores: c.flw, c.flwsp, c.fsw, c.fswsp, c.fld, c.fldsp, c.fsd, c.fsdsp.

-
-
- - - - - -
- - -
-

the C extension only includes F/D instructions when D and F are also specified

-
-
-
-
-
-

29.6. Zcf (RV32 only)

-
-

Zcf is the existing set of compressed single precision floating point loads and stores: c.flw, c.flwsp, c.fsw, c.fswsp.

-
-
-

Zcf is only relevant to RV32, it cannot be specified for RV64.

-
-
-

The Zcf extension depends on the Zca and F extensions.

-
-
-
-

29.7. Zcd

-
-

Zcd is the existing set of compressed double precision floating point loads and stores: c.fld, c.fldsp, c.fsd, c.fsdsp.

-
-
-

The Zcd extension depends on the Zca and D extensions.

-
-
-
-

29.8. Zcb

-
-

Zcb has simple code-size saving instructions which are easy to implement on all CPUs.

-
-
-

All encodings are currently reserved for all architectures, and have no conflicts with any existing extensions.

-
-
- - - - - -
- - -Zcb can be implemented on any CPU as the instructions are 16-bit versions of existing 32-bit instructions from the application class profile. -
-
-
-

The Zcb extension depends on the Zca extension.

-
-
-

As shown on the individual instruction pages, many of the instructions in Zcb depend upon another extension being implemented. For example, c.mul is only implemented if M or Zmmul is implemented, and c.sext.b is only implemented if Zbb is implemented.

-
-
-

The c.mul encoding uses the CA register format along with other instructions such as c.sub, c.xor etc.

-
-
- - - - - -
- - - c.sext.w is a pseudoinstruction for c.addiw rd, 0 (RV64) -
-
- ------ - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
RV32 | RV64 | Mnemonic | Instruction

yes

yes

c.lbu rd', uimm(rs1')

Load unsigned byte, 16-bit encoding

yes

yes

c.lhu rd', uimm(rs1')

Load unsigned halfword, 16-bit encoding

yes

yes

c.lh rd', uimm(rs1')

Load signed halfword, 16-bit encoding

yes

yes

c.sb rs2', uimm(rs1')

Store byte, 16-bit encoding

yes

yes

c.sh rs2', uimm(rs1')

Store halfword, 16-bit encoding

yes

yes

c.zext.b rsd'

Zero extend byte, 16-bit encoding

yes

yes

c.sext.b rsd'

Sign extend byte, 16-bit encoding

yes

yes

c.zext.h rsd'

Zero extend halfword, 16-bit encoding

yes

yes

c.sext.h rsd'

Sign extend halfword, 16-bit encoding

yes

c.zext.w rsd'

Zero extend word, 16-bit encoding

yes

yes

c.not rsd'

Bitwise not, 16-bit encoding

yes

yes

c.mul rsd', rs2'

Multiply, 16-bit encoding

-
-
-
-

29.9. Zcmp

-
-

The Zcmp extension is a set of instructions which may be executed as a series of existing 32-bit RISC-V instructions.

-
-
-

This extension reuses some encodings from c.fsdsp. Therefore it is incompatible with Zcd, which is included when C and D extensions are both present.

-
-
- - - - - -
- - -Zcmp is primarily targeted at embedded class CPUs due to implementation complexity. Additionally, it is not compatible with architecture class profiles. -
-
-
-

The Zcmp extension depends on the Zca extension.

-
-
-

The PUSH/POP assembly syntax uses several variables, the meanings of which are:

-
-
-
    -
  • -

    reg_list is a list containing 1 to 13 registers (ra and 0 to 12 s registers)

    -
    -
      -
    • -

      valid values: {ra}, {ra, s0}, {ra, s0-s1}, {ra, s0-s2}, …​, {ra, s0-s8}, {ra, s0-s9}, {ra, s0-s11}

      -
    • -
    • -

      note that {ra, s0-s10} is not valid, giving 12 lists not 13 for better encoding

      -
    • -
    -
    -
  • -
  • -

    stack_adj is the total size of the stack frame.

    -
    -
      -
    • -

      valid values vary with register list length and the specific encoding, see the instruction pages for details.

      -
    • -
    -
    -
  • -
-
- ------ - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
RV32 | RV64 | Mnemonic | Instruction

yes

yes

cm.push {reg_list}, -stack_adj

cm.push

yes

yes

cm.pop {reg_list}, stack_adj

cm.pop

yes

yes

cm.popret {reg_list}, stack_adj

cm.popret

yes

yes

cm.popretz {reg_list}, stack_adj

cm.popretz

yes

yes

cm.mva01s rs1', rs2'

Move two s0-s7 registers into a0-a1

yes

yes

cm.mvsa01 r1s', r2s'

Move a0-a1 into two different s0-s7 registers

-
-
-
-

29.10. Zcmt

-
-

Zcmt adds the table jump instructions and also adds the jvt CSR. The jvt CSR requires a state enable if Smstateen is implemented. See jvt CSR, table jump base vector and control register for details.

-
-
-

This extension reuses some encodings from c.fsdsp. Therefore it is incompatible with Zcd, which is included when C and D extensions are both present.

-
-
- - - - - -
- - -Zcmt is primarily targeted at embedded class CPUs due to implementation complexity. Additionally, it is not compatible with RVA profiles. -
-
-
-

The Zcmt extension depends on the Zca and Zicsr extensions.

-
- ------ - - - - - - - - - - - - - - - - - - - - - - -
RV32 | RV64 | Mnemonic | Instruction

yes

yes

cm.jt index

Jump via table

yes

yes

cm.jalt index

Jump and link via table

-
-
-

29.11. Zc instruction formats

-
-

Several instructions in this specification use the following new instruction formats.

-
- --------------- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
Format | instructions | 15:10 | 9 | 8 | 7 | 6 | 5 | 4 | 3 | 2 | 1 | 0

CLB

c.lbu

funct6

rs1'

uimm

rd'

op

CSB

c.sb

funct6

rs1'

uimm

rs2'

op

CLH

c.lhu, c.lh

funct6

rs1'

funct1

uimm

rd'

op

CSH

c.sh

funct6

rs1'

funct1

uimm

rs2'

op

CU

c.[sz]ext.*, c.not

funct6

rd'/rs1'

funct5

op

CMMV

cm.mvsa01 cm.mva01s

funct6

r1s'

funct2

r2s'

op

CMJT

cm.jt cm.jalt

funct6

index

op

CMPP

cm.push*, cm.pop*

funct6

funct2

urlist

spimm

op

-
- - - - - -
- - -
-

c.mul uses the existing CA format.

-
-
-
-
-
-
-

29.12. Zcb instructions

-
-

29.12.1. c.lbu

-
-

Synopsis:

-
-
-

Load unsigned byte, 16-bit encoding

-
-
-

Mnemonic:

-
-
-

c.lbu rd', uimm(rs1')

-
-
-

Encoding (RV32, RV64):

-
-
-
-Diagram -
-
-
-

The immediate offset is formed as follows:

-
-
-
-
  uimm[31:2] = 0;
-  uimm[1]    = encoding[5];
-  uimm[0]    = encoding[6];
-
-
-
-

Description:

-
-
-

This instruction loads a byte from the memory address formed by adding rs1' to the zero extended immediate uimm. The resulting byte is zero extended to XLEN bits and is written to rd'.

-
-
- - - - - -
- - -
-

rd' and rs1' are from the standard 8-register set x8-x15.

-
-
-
-
-

Prerequisites:

-
-
-

None

-
-
-

Operation:

-
-
-
-
//This is not SAIL, it's pseudocode. The SAIL hasn't been written yet.
-
-X(rdc) = EXTZ(mem[X(rs1c)+EXTZ(uimm)][7..0]);
-
-
-
-
-
-

29.12.2. c.lhu

-
-

Synopsis:

-
-
-

Load unsigned halfword, 16-bit encoding

-
-
-

Mnemonic:

-
-
-

c.lhu rd', uimm(rs1')

-
-
-

Encoding (RV32, RV64):

-
-
-
-Diagram -
-
-
-

The immediate offset is formed as follows:

-
-
-
-
  uimm[31:2] = 0;
-  uimm[1]    = encoding[5];
-  uimm[0]    = 0;
-
-
-
-

Description:

-
-
-

This instruction loads a halfword from the memory address formed by adding rs1' to the zero extended immediate uimm. The resulting halfword is zero extended to XLEN bits and is written to rd'.

-
-
- - - - - -
- - -
-

rd' and rs1' are from the standard 8-register set x8-x15.

-
-
-
-
-

Prerequisites:

-
-
-

None

-
-
-

Operation:

-
-
-
-
//This is not SAIL, it's pseudocode. The SAIL hasn't been written yet.
-
-X(rdc) = EXTZ(load_mem[X(rs1c)+EXTZ(uimm)][15..0]);
-
-
-
-
-
-

29.12.3. c.lh

-
-

Synopsis:

-
-
-

Load signed halfword, 16-bit encoding

-
-
-

Mnemonic:

-
-
-

c.lh rd', uimm(rs1')

-
-
-

Encoding (RV32, RV64):

-
-
-
-Diagram -
-
-
-

The immediate offset is formed as follows:

-
-
-
-
  uimm[31:2] = 0;
-  uimm[1]    = encoding[5];
-  uimm[0]    = 0;
-
-
-
-

Description:

-
-
-

This instruction loads a halfword from the memory address formed by adding rs1' to the zero extended immediate uimm. The resulting halfword is sign extended to XLEN bits and is written to rd'.

-
-
- - - - - -
- - -
-

rd' and rs1' are from the standard 8-register set x8-x15.

-
-
-
-
-

Prerequisites:

-
-
-

None

-
-
-

Operation:

-
-
-
-
//This is not SAIL, it's pseudocode. The SAIL hasn't been written yet.
-
-X(rdc) = EXTS(load_mem[X(rs1c)+EXTZ(uimm)][15..0]);
-
-
-
-
-
-

29.12.4. c.sb

-
-

Synopsis:

-
-
-

Store byte, 16-bit encoding

-
-
-

Mnemonic:

-
-
-

c.sb rs2', uimm(rs1')

-
-
-

Encoding (RV32, RV64):

-
-
-
-Diagram -
-
-
-

The immediate offset is formed as follows:

-
-
-
-
  uimm[31:2] = 0;
-  uimm[1]    = encoding[5];
-  uimm[0]    = encoding[6];
-
-
-
-

Description:

-
-
-

This instruction stores the least significant byte of rs2' to the memory address formed by adding rs1' to the zero extended immediate uimm.

-
-
- - - - - -
- - -
-

rs1' and rs2' are from the standard 8-register set x8-x15.

-
-
-
-
-

Prerequisites:

-
-
-

None

-
-
-

Operation:

-
-
-
-
//This is not SAIL, it's pseudocode. The SAIL hasn't been written yet.
-
-mem[X(rs1c)+EXTZ(uimm)][7..0] = X(rs2c)
-
-
-
-
-
-

29.12.5. c.sh

-
-

Synopsis:

-
-
-

Store halfword, 16-bit encoding

-
-
-

Mnemonic:

-
-
-

c.sh rs2', uimm(rs1')

-
-
-

Encoding (RV32, RV64):

-
-
-
-Diagram -
-
-
-

The immediate offset is formed as follows:

-
-
-
-
  uimm[31:2] = 0;
-  uimm[1]    = encoding[5];
-  uimm[0]    = 0;
-
-
-
-

Description:

-
-
-

This instruction stores the least significant halfword of rs2' to the memory address formed by adding rs1' to the zero extended immediate uimm.

-
-
- - - - - -
- - -
-

rs1' and rs2' are from the standard 8-register set x8-x15.

-
-
-
-
-

Prerequisites:

-
-
-

None

-
-
-

Operation:

-
-
-
-
//This is not SAIL, it's pseudocode. The SAIL hasn't been written yet.
-
-mem[X(rs1c)+EXTZ(uimm)][15..0] = X(rs2c)
-
-
-
-
-
-

29.12.6. c.zext.b

-
-

Synopsis:

-
-
-

Zero extend byte, 16-bit encoding

-
-
-

Mnemonic:

-
-
-

c.zext.b rd'/rs1'

-
-
-

Encoding (RV32, RV64):

-
-
-
-Diagram -
-
-
-

Description:

-
-
-

This instruction takes a single source/destination operand. It zero-extends the least-significant byte of the operand to XLEN bits by inserting zeros into all of the bits more significant than 7.

-
-
- - - - - -
- - -
-

rd'/rs1' is from the standard 8-register set x8-x15.

-
-
-
-
-

Prerequisites:

-
-
-

None

-
-
-

32-bit equivalent:

-
-
-
-
andi rd'/rs1', rd'/rs1', 0xff
-
-
-
- - - - - -
- - -
-

The SAIL module variable for rd'/rs1' is called rsdc.

-
-
-
-
-

Operation:

-
-
-
-
X(rsdc) = EXTZ(X(rsdc)[7..0]);
-
-
-
-
-
-

29.12.7. c.sext.b

-
-

Synopsis:

-
-
-

Sign extend byte, 16-bit encoding

-
-
-

Mnemonic:

-
-
-

c.sext.b rd'/rs1'

-
-
-

Encoding (RV32, RV64):

-
-
-
-Diagram -
-
-
-

Description:

-
-
-

This instruction takes a single source/destination operand. It sign-extends the least-significant byte in the operand to XLEN bits by copying the most-significant bit in the byte (i.e., bit 7) to all of the more-significant bits.

-
-
- - - - - -
- - -
-

rd'/rs1' is from the standard 8-register set x8-x15.

-
-
-
-
-

Prerequisites:

-
-
-

Zbb is also required.

-
-
- - - - - -
- - -The SAIL module variable for rd'/rs1' is called rsdc. -
-
-
-

Operation:

-
-
-
-
X(rsdc) = EXTS(X(rsdc)[7..0]);
-
-
-
-
-
-

29.12.8. c.zext.h

-
-

Synopsis:

-
-
-

Zero extend halfword, 16-bit encoding

-
-
-

Mnemonic:

-
-
-

c.zext.h rd'/rs1'

-
-
-

Encoding (RV32, RV64):

-
-
-
-Diagram -
-
-
-

Description:

-
-
-

This instruction takes a single source/destination operand. It zero-extends the least-significant halfword of the operand to XLEN bits by inserting zeros into all of the bits more significant than 15.

-
-
- - - - - -
- - -
-

rd'/rs1' is from the standard 8-register set x8-x15.

-
-
-
-
-

Prerequisites:

-
-
-

Zbb is also required.

-
-
- - - - - -
- - -
-

The SAIL module variable for rd'/rs1' is called rsdc.

-
-
-
-
-

Operation:

-
-
-
-
X(rsdc) = EXTZ(X(rsdc)[15..0]);
-
-
-
-
-
-

29.12.9. c.sext.h

-
-

Synopsis:

-
-
-

Sign extend halfword, 16-bit encoding

-
-
-

Mnemonic:

-
-
-

c.sext.h rd'/rs1'

-
-
-

Encoding (RV32, RV64):

-
-
-
-Diagram -
-
-
-

Description:

-
-
-

This instruction takes a single source/destination operand. It sign-extends the least-significant halfword in the operand to XLEN bits by copying the most-significant bit in the halfword (i.e., bit 15) to all of the more-significant bits.

-
-
- - - - - -
- - -
-

rd'/rs1' is from the standard 8-register set x8-x15.

-
-
-
-
-

Prerequisites:

-
-
-

Zbb is also required.

-
-
- - - - - -
- - -
-

The SAIL module variable for rd'/rs1' is called rsdc.

-
-
-
-
-

Operation:

-
-
-
-
X(rsdc) = EXTS(X(rsdc)[15..0]);
-
-
-
-
-
-

29.12.10. c.zext.w

-
-

Synopsis:

-
-
-

Zero extend word, 16-bit encoding

-
-
-

Mnemonic:

-
-
-

c.zext.w rd'/rs1'

-
-
-

Encoding (RV64):

-
-
-
-Diagram -
-
-
-

Description:

-
-
-

This instruction takes a single source/destination operand. It zero-extends the least-significant word of the operand to XLEN bits by inserting zeros into all of the bits more significant than 31.

-
-
- - - - - -
- - -
-

rd'/rs1' is from the standard 8-register set x8-x15.

-
-
-
-
-

Prerequisites:

-
-
-

Zba is also required.

-
-
-

32-bit equivalent:

-
-
-
-
add.uw rd'/rs1', rd'/rs1', zero
-
-
-
- - - - - -
- - -
-

The SAIL module variable for rd'/rs1' is called rsdc.

-
-
-
-
-

Operation:

-
-
-
-
X(rsdc) = EXTZ(X(rsdc)[31..0]);
-
-
-
-
-
-

29.12.11. c.not

-
-

Synopsis:

-
-
-

Bitwise not, 16-bit encoding

-
-
-

Mnemonic:

-
-
-

c.not rd'/rs1'

-
-
-

Encoding (RV32, RV64):

-
-
-
-Diagram -
-
-
-

Description:

-
-
-

This instruction takes the one’s complement of rd'/rs1' and writes the result to the same register.

-
-
- - - - - -
- - -
-

rd'/rs1' is from the standard 8-register set x8-x15.

-
-
-
-
-

Prerequisites:

-
-
-

None

-
-
-

32-bit equivalent:

-
-
-
-
xori rd'/rs1', rd'/rs1', -1
-
-
-
- - - - - -
- - -
-

The SAIL module variable for rd'/rs1' is called rsdc.

-
-
-
-
-

Operation:

-
-
-
-
X(rsdc) = X(rsdc) XOR -1;
-
-
-
-
-
-

29.12.12. c.mul

-
-

Synopsis:

-
-
-

Multiply, 16-bit encoding

-
-
-

Mnemonic:

-
-
-

c.mul rsd', rs2'

-
-
-

Encoding (RV32, RV64):

-
-
-
-Diagram -
-
-
-

Description:

-
-
-

This instruction multiplies XLEN bits of the source operands from rsd' and rs2' and writes the lowest XLEN bits of the result to rsd'.

-
-
- - - - - -
- - -
-

rd'/rs1' and rs2' are from the standard 8-register set x8-x15.

-
-
-
-
-

Prerequisites:

-
-
-

M or Zmmul must be configured.

-
-
- - - - - -
- - -
-

The SAIL module variable for rd'/rs1' is called rsdc, and for rs2' is called rs2c.

-
-
-
-
-

Operation:

-
-
-
-
let result_wide = to_bits(2 * sizeof(xlen), signed(X(rsdc)) * signed(X(rs2c)));
-X(rsdc) = result_wide[(sizeof(xlen) - 1) .. 0];
-
-
-
-
-
-
-

29.13. PUSH/POP register instructions

-
-

These instructions are collectively referred to as PUSH/POP:

-
-
- -
-
-

The term PUSH refers to cm.push.

-
-
-

The term POP refers to cm.pop.

-
-
-

The term POPRET refers to cm.popret and cm.popretz.

-
-
-

Common details for these instructions are in this section.

-
-
-

29.13.1. PUSH/POP functional overview

-
-

PUSH, POP, POPRET are used to reduce the size of function prologues and epilogues.

-
-
-
    -
  1. -

    The PUSH instruction

    -
    -
      -
    • -

      adjusts the stack pointer to create the stack frame

      -
    • -
    • -

      pushes (stores) the registers specified in the register list to the stack frame

      -
    • -
    -
    -
  2. -
  3. -

    The POP instruction

    -
    -
      -
    • -

      pops (loads) the registers in the register list from the stack frame

      -
    • -
    • -

      adjusts the stack pointer to destroy the stack frame

      -
    • -
    -
    -
  4. -
  5. -

    The POPRET instructions

    -
    -
      -
    • -

      pop (load) the registers in the register list from the stack frame

      -
    • -
    • -

      cm.popretz also moves zero into a0 as the return value

      -
    • -
    • -

      adjust the stack pointer to destroy the stack frame

      -
    • -
    • -

      execute a ret instruction to return from the function

      -
    • -
    -
    -
  6. -
-
-
-
-
-

29.13.2. Example usage

-
-

This example gives an illustration of the use of PUSH and POPRET.

-
-
-

The example is taken from the function processMarkers in the EMBench benchmark picojpeg, found in the following file on GitHub: libpicojpeg.c

-
-
-

The prologue and epilogue compile with GCC10 to:

-
-
-
-
   0001098a <processMarkers>:
-   1098a:       711d                    addi    sp,sp,-96 ;#cm.push(1)
-   1098c:       c8ca                    sw      s2,80(sp) ;#cm.push(2)
-   1098e:       c6ce                    sw      s3,76(sp) ;#cm.push(3)
-   10990:       c4d2                    sw      s4,72(sp) ;#cm.push(4)
-   10992:       ce86                    sw      ra,92(sp) ;#cm.push(5)
-   10994:       cca2                    sw      s0,88(sp) ;#cm.push(6)
-   10996:       caa6                    sw      s1,84(sp) ;#cm.push(7)
-   10998:       c2d6                    sw      s5,68(sp) ;#cm.push(8)
-   1099a:       c0da                    sw      s6,64(sp) ;#cm.push(9)
-   1099c:       de5e                    sw      s7,60(sp) ;#cm.push(10)
-   1099e:       dc62                    sw      s8,56(sp) ;#cm.push(11)
-   109a0:       da66                    sw      s9,52(sp) ;#cm.push(12)
-   109a2:       d86a                    sw      s10,48(sp);#cm.push(13)
-   109a4:       d66e                    sw      s11,44(sp);#cm.push(14)
-...
-   109f4:       4501                    li      a0,0      ;#cm.popretz(1)
-   109f6:       40f6                    lw      ra,92(sp) ;#cm.popretz(2)
-   109f8:       4466                    lw      s0,88(sp) ;#cm.popretz(3)
-   109fa:       44d6                    lw      s1,84(sp) ;#cm.popretz(4)
-   109fc:       4946                    lw      s2,80(sp) ;#cm.popretz(5)
-   109fe:       49b6                    lw      s3,76(sp) ;#cm.popretz(6)
-   10a00:       4a26                    lw      s4,72(sp) ;#cm.popretz(7)
-   10a02:       4a96                    lw      s5,68(sp) ;#cm.popretz(8)
-   10a04:       4b06                    lw      s6,64(sp) ;#cm.popretz(9)
-   10a06:       5bf2                    lw      s7,60(sp) ;#cm.popretz(10)
-   10a08:       5c62                    lw      s8,56(sp) ;#cm.popretz(11)
-   10a0a:       5cd2                    lw      s9,52(sp) ;#cm.popretz(12)
-   10a0c:       5d42                    lw      s10,48(sp);#cm.popretz(13)
-   10a0e:       5db2                    lw      s11,44(sp);#cm.popretz(14)
-   10a10:       6125                    addi    sp,sp,96  ;#cm.popretz(15)
-   10a12:       8082                    ret               ;#cm.popretz(16)
-
-
-
-
-

with the GCC option -msave-restore the output is the following:

-
-
-
-
0001080e <processMarkers>:
-   1080e:       73a012ef                jal     t0,11f48 <__riscv_save_12>
-   10812:       1101                    addi    sp,sp,-32
-...
-   10862:       4501                    li      a0,0
-   10864:       6105                    addi    sp,sp,32
-   10866:       71e0106f                j       11f84 <__riscv_restore_12>
-
-
-
-

with PUSH/POPRET this reduces to

-
-
-
-
0001080e <processMarkers>:
-   1080e:       b8fa                    cm.push    {ra,s0-s11},-96
-...
-   10866:       bcfa                    cm.popretz {ra,s0-s11}, 96
-
-
-
-

The prologue / epilogue reduce from 60-bytes in the original code, to 14-bytes with -msave-restore, and to 4-bytes with PUSH and POPRET. As well as reducing the code-size, PUSH and POPRET eliminate the branches from calling the millicode save/restore routines and so may also perform better.

-
-
- - - - - -
- - -
-

The calls to <__riscv_save_0>/<__riscv_restore_0> become 64-bit when the target functions are out of the ±1MB range, increasing the prologue/epilogue size to 22-bytes.

-
-
-
-
- - - - - -
- - -
-

POP is typically used in tail-calling sequences where ret is not used to return to ra after destroying the stack frame.

-
-
-
-
-
Stack pointer adjustment handling
-
-

The instructions all automatically adjust the stack pointer by enough to cover the memory required for the registers being saved or restored. Additionally the spimm field in the encoding allows the stack pointer to be adjusted in additional increments of 16-bytes. There is only a small restricted range available in the encoding; if the range is insufficient then a separate c.addi16sp can be used to increase the range.

-
-
-
-
Register list handling
-
-

There is no support for the {ra, s0-s10} register list without also adding s11. Therefore the {ra, s0-s11} register list must be used in this case.

-
-
-
-
-

29.13.3. PUSH/POP Fault handling

-
-

Correct execution requires that sp refers to idempotent memory (also see Non-idempotent memory handling), because the core must be able to handle traps detected during the sequence. The entire PUSH/POP sequence is re-executed after returning from the trap handler, and multiple traps are possible during the sequence.

-
-
-

If a trap occurs during the sequence then xEPC is updated with the PC of the instruction, xTVAL (if not read-only-zero) is updated with the bad address if it was an access fault, and xCAUSE is updated with the type of trap.

-
-
- - - - - -
- - -It is implementation defined whether interrupts can also be taken during the sequence execution. -
-
-
-
-

29.13.4. Software view of execution

-
-
Software view of the PUSH sequence
-
-

From a software perspective the PUSH sequence appears as:

-
-
-
    -
  • -

    A sequence of stores writing the bytes required by the pseudocode

    -
    -
      -
    • -

      The bytes may be written in any order.

      -
    • -
    • -

      The bytes may be grouped into larger accesses.

      -
    • -
    • -

      Any of the bytes may be written multiple times.

      -
    • -
    -
    -
  • -
  • -

    A stack pointer adjustment

    -
  • -
-
-
- - - - - -
- - -
-

If an implementation allows interrupts during the sequence, and the interrupt handler uses sp to allocate stack memory, then any stores which were executed before the interrupt may be overwritten by the handler. This is safe because the memory is idempotent and the stores will be re-executed when execution resumes.

-
-
-
-
-

The stack pointer adjustment must be committed only when it is certain that the entire PUSH instruction will commit.

-
-
-

Stores may also return imprecise faults from the bus. It is platform defined whether the core implementation waits for the bus responses before continuing to the final stage of the sequence, or handles error responses after completing the PUSH instruction.

-
-
-
-

For example:

-
-
-
-
cm.push  {ra, s0-s5}, -64
-
-
-
-

Appears to software as:

-
-
-
-
# any bytes from sp-1 to sp-28 may be written multiple times before
-# the instruction completes therefore these updates may be visible in
-# the interrupt/exception handler below the stack pointer
-sw  s5, -4(sp)
-sw  s4, -8(sp)
-sw  s3,-12(sp)
-sw  s2,-16(sp)
-sw  s1,-20(sp)
-sw  s0,-24(sp)
-sw  ra,-28(sp)
-
-# this must only execute once, and will only execute after all stores
-# completed without any precise faults, therefore this update is only
-# visible in the interrupt/exception handler if cm.push has completed
-addi sp, sp, -64
-
-
-
-
-
Software view of the POP/POPRET sequence
-
-

From a software perspective the POP/POPRET sequence appears as:

-
-
-
    -
  • -

    A sequence of loads reading the bytes required by the pseudocode.

    -
    -
      -
    • -

      The bytes may be loaded in any order.

      -
    • -
    • -

      The bytes may be grouped into larger accesses.

      -
    • -
    • -

      Any of the bytes may be loaded multiple times.

      -
    • -
    -
    -
  • -
  • -

    A stack pointer adjustment

    -
  • -
  • -

    An optional li a0, 0

    -
  • -
  • -

    An optional ret

    -
  • -
-
-
-

If a trap occurs during the sequence, then any loads which were executed before the trap may update architectural state. The loads will be re-executed once the trap handler completes, so the values will be overwritten. Therefore it is permitted for an implementation to update some of the destination registers before taking a fault.

-
-
-

The optional li a0, 0, stack pointer adjustment and optional ret must be committed only when it is certain that the entire POP/POPRET instruction will commit.

-
-
-

For POPRET once the stack pointer adjustment has been committed the ret must execute.

-
-
-
-

For example:

-
-
-
-
cm.popretz {ra, s0-s3}, 32;
-
-
-
-

Appears to software as:

-
-
-
-
# any or all of these load instructions may execute multiple times
-# therefore these updates may be visible in the interrupt/exception handler
-lw   s3, 28(sp)
-lw   s2, 24(sp)
-lw   s1, 20(sp)
-lw   s0, 16(sp)
-lw   ra, 12(sp)
-
-# these must only execute once, will only execute after all loads
-# complete successfully all instructions must execute atomically
-# therefore these updates are not visible in the interrupt/exception handler
-li a0, 0
-addi sp, sp, 32
-ret
-
-
-
-
-
-

29.13.5. Non-idempotent memory handling

-
-

An implementation may have a requirement to issue a PUSH/POP instruction to non-idempotent memory.

-
-
-

If the core implementation does not support PUSH/POP to non-idempotent memories, the core may use an idempotency PMA to detect it and take a load (POP/POPRET) or store (PUSH) access fault exception in order to avoid unpredictable results.

-
-
-

Software should only use these instructions on non-idempotent memory regions when software can tolerate the required memory accesses being issued repeatedly in the case that they cause exceptions.

-
-
-
-
-

29.13.6. Example RV32I PUSH/POP sequences

-
-

The examples included show the load/store series expansion and the stack adjustment. Examples of cm.popret and cm.popretz are not included, as the difference in the expanded sequence from cm.pop is trivial in all cases.

-
-
-
cm.push {ra, s0-s2}, -64
-
-

Encoding: rlist=7, spimm=3

-
-
-

expands to:

-
-
-
-
sw  s2,  -4(sp);
-sw  s1,  -8(sp);
-sw  s0, -12(sp);
-sw  ra, -16(sp);
-addi sp, sp, -64;
-
-
-
-
-
cm.push {ra, s0-s11}, -112
-
-

Encoding: rlist=15, spimm=3

-
-
-

expands to:

-
-
-
-
sw  s11,  -4(sp);
-sw  s10,  -8(sp);
-sw  s9,  -12(sp);
-sw  s8,  -16(sp);
-sw  s7,  -20(sp);
-sw  s6,  -24(sp);
-sw  s5,  -28(sp);
-sw  s4,  -32(sp);
-sw  s3,  -36(sp);
-sw  s2,  -40(sp);
-sw  s1,  -44(sp);
-sw  s0,  -48(sp);
-sw  ra,  -52(sp);
-addi sp, sp, -112;
-
-
-
-
-
-
cm.pop {ra}, 16
-
-

Encoding: rlist=4, spimm=0

-
-
-

expands to:

-
-
-
-
lw   ra, 12(sp);
-addi sp, sp, 16;
-
-
-
-
-
cm.pop {ra, s0-s3}, 48
-
-

Encoding: rlist=8, spimm=1

-
-
-

expands to:

-
-
-
-
lw   s3, 44(sp);
-lw   s2, 40(sp);
-lw   s1, 36(sp);
-lw   s0, 32(sp);
-lw   ra, 28(sp);
-addi sp, sp, 48;
-
-
-
-
-
cm.pop {ra, s0-s4}, 64
-
-

Encoding: rlist=9, spimm=2

-
-
-

expands to:

-
-
-
-
lw   s4, 60(sp);
-lw   s3, 56(sp);
-lw   s2, 52(sp);
-lw   s1, 48(sp);
-lw   s0, 44(sp);
-lw   ra, 40(sp);
-addi sp, sp, 64;
-
-
-
-
-
-
-

29.13.7. cm.push

-
-

Synopsis:

-
-
-

Create stack frame: store ra and 0 to 12 saved registers to the stack frame, optionally allocate additional stack space.

-
-
-

Mnemonic:

-
-
-

cm.push {reg_list}, -stack_adj

-
-
-

Encoding (RV32, RV64):

-
-
-
-Diagram -
-
-
- - - - - -
- - -
-

rlist values 0 to 3 are reserved for a future EABI variant called cm.push.e

-
-
-
-
-

Assembly Syntax:

-
-
-
-
cm.push {reg_list},  -stack_adj
-cm.push {xreg_list}, -stack_adj
-
-
-
-

The variables used in the assembly syntax are defined below.

-
-
-
-
RV32E:
-
-switch (rlist){
-  case  4: {reg_list="ra";         xreg_list="x1";}
-  case  5: {reg_list="ra, s0";     xreg_list="x1, x8";}
-  case  6: {reg_list="ra, s0-s1";  xreg_list="x1, x8-x9";}
-  default: reserved();
-}
-stack_adj      = stack_adj_base + spimm[5:4] * 16;
-
-
-
-
-
RV32I, RV64:
-
-switch (rlist){
-  case  4: {reg_list="ra";         xreg_list="x1";}
-  case  5: {reg_list="ra, s0";     xreg_list="x1, x8";}
-  case  6: {reg_list="ra, s0-s1";  xreg_list="x1, x8-x9";}
-  case  7: {reg_list="ra, s0-s2";  xreg_list="x1, x8-x9, x18";}
-  case  8: {reg_list="ra, s0-s3";  xreg_list="x1, x8-x9, x18-x19";}
-  case  9: {reg_list="ra, s0-s4";  xreg_list="x1, x8-x9, x18-x20";}
-  case 10: {reg_list="ra, s0-s5";  xreg_list="x1, x8-x9, x18-x21";}
-  case 11: {reg_list="ra, s0-s6";  xreg_list="x1, x8-x9, x18-x22";}
-  case 12: {reg_list="ra, s0-s7";  xreg_list="x1, x8-x9, x18-x23";}
-  case 13: {reg_list="ra, s0-s8";  xreg_list="x1, x8-x9, x18-x24";}
-  case 14: {reg_list="ra, s0-s9";  xreg_list="x1, x8-x9, x18-x25";}
-  //note - to include s10, s11 must also be included
-  case 15: {reg_list="ra, s0-s11"; xreg_list="x1, x8-x9, x18-x27";}
-  default: reserved();
-}
-stack_adj      = stack_adj_base + spimm[5:4] * 16;
-
-
-
-
-
RV32E:
-
-stack_adj_base = 16;
-Valid values:
-stack_adj      = [16|32|48|64];
-
-
-
-
-
RV32I:
-
-switch (rlist) {
-  case  4.. 7: stack_adj_base = 16;
-  case  8..11: stack_adj_base = 32;
-  case 12..14: stack_adj_base = 48;
-  case     15: stack_adj_base = 64;
-}
-
-Valid values:
-switch (rlist) {
-  case  4.. 7: stack_adj = [16|32|48| 64];
-  case  8..11: stack_adj = [32|48|64| 80];
-  case 12..14: stack_adj = [48|64|80| 96];
-  case     15: stack_adj = [64|80|96|112];
-}
-
-
-
-
-
RV64:
-
-switch (rlist) {
-  case  4.. 5: stack_adj_base =  16;
-  case  6.. 7: stack_adj_base =  32;
-  case  8.. 9: stack_adj_base =  48;
-  case 10..11: stack_adj_base =  64;
-  case 12..13: stack_adj_base =  80;
-  case     14: stack_adj_base =  96;
-  case     15: stack_adj_base = 112;
-}
-
-Valid values:
-switch (rlist) {
-  case  4.. 5: stack_adj = [ 16| 32| 48| 64];
-  case  6.. 7: stack_adj = [ 32| 48| 64| 80];
-  case  8.. 9: stack_adj = [ 48| 64| 80| 96];
-  case 10..11: stack_adj = [ 64| 80| 96|112];
-  case 12..13: stack_adj = [ 80| 96|112|128];
-  case     14: stack_adj = [ 96|112|128|144];
-  case     15: stack_adj = [112|128|144|160];
-}
-
-
-
-
-

Description:

-
-
-

This instruction pushes (stores) the registers in reg_list to the memory below the stack pointer, and then creates the stack frame by decrementing the stack pointer by stack_adj, including any additional stack space requested by the value of spimm.

-
-
- - - - - -
- - -
-

All ABI register mappings are for the UABI. An EABI version is planned once the EABI is frozen.

-
-
-
-
-

For further information see PUSH/POP Register Instructions.

-
-
-

Stack Adjustment Calculation:

-
-
-

stack_adj_base is the minimum number of bytes, in multiples of 16-byte address increments, required to cover the registers in the list.

-
-
-

spimm is the number of additional 16-byte address increments allocated for the stack frame.

-
-
-

The total stack adjustment represents the total size of the stack frame, which is stack_adj_base added to spimm scaled by 16, as defined above.

-
-
-

Prerequisites:

-
-
-

None

-
-
-

32-bit equivalent:

-
-
-

No direct equivalent encoding exists

-
-
-

Operation:

-
-
-

The first section of pseudocode may be executed multiple times before the instruction successfully completes.

-
-
-
-
//This is not SAIL, it's pseudocode. The SAIL hasn't been written yet.
-
-if (XLEN==32) bytes=4; else bytes=8;
-
-addr=sp-bytes;
-for(i in 27,26,25,24,23,22,21,20,19,18,9,8,1)  {
-  //if register i is in xreg_list
-  if (xreg_list[i]) {
-    switch(bytes) {
-      4:  asm("sw x[i], 0(addr)");
-      8:  asm("sd x[i], 0(addr)");
-    }
-    addr-=bytes;
-  }
-}
-
-
-
-

The final section of pseudocode executes atomically, and only executes if the section above completes without any exceptions or interrupts.

-
-
-
-
//This is not SAIL, it's pseudocode. The SAIL hasn't been written yet.
-
-sp-=stack_adj;
-
-
-
-
-
-

29.13.8. cm.pop

-
-

Synopsis:

-
-
-

Destroy stack frame: load ra and 0 to 12 saved registers from the stack frame, deallocate the stack frame.

-
-
-

Mnemonic:

-
-
-

cm.pop {reg_list}, stack_adj

-
-
-

Encoding (RV32, RV64):

-
-
-
-Diagram -
-
-
- - - - - -
- - -
-

rlist values 0 to 3 are reserved for a future EABI variant called cm.pop.e

-
-
-
-
-

Assembly Syntax:

-
-
-
-
cm.pop {reg_list},  stack_adj
-cm.pop {xreg_list}, stack_adj
-
-
-
-

The variables used in the assembly syntax are defined below.

-
-
-
-
RV32E:
-
-switch (rlist){
-  case  4: {reg_list="ra";         xreg_list="x1";}
-  case  5: {reg_list="ra, s0";     xreg_list="x1, x8";}
-  case  6: {reg_list="ra, s0-s1";  xreg_list="x1, x8-x9";}
-  default: reserved();
-}
-stack_adj      = stack_adj_base + spimm[5:4] * 16;
-
-
-
-
-
RV32I, RV64:
-
-switch (rlist){
-  case  4: {reg_list="ra";         xreg_list="x1";}
-  case  5: {reg_list="ra, s0";     xreg_list="x1, x8";}
-  case  6: {reg_list="ra, s0-s1";  xreg_list="x1, x8-x9";}
-  case  7: {reg_list="ra, s0-s2";  xreg_list="x1, x8-x9, x18";}
-  case  8: {reg_list="ra, s0-s3";  xreg_list="x1, x8-x9, x18-x19";}
-  case  9: {reg_list="ra, s0-s4";  xreg_list="x1, x8-x9, x18-x20";}
-  case 10: {reg_list="ra, s0-s5";  xreg_list="x1, x8-x9, x18-x21";}
-  case 11: {reg_list="ra, s0-s6";  xreg_list="x1, x8-x9, x18-x22";}
-  case 12: {reg_list="ra, s0-s7";  xreg_list="x1, x8-x9, x18-x23";}
-  case 13: {reg_list="ra, s0-s8";  xreg_list="x1, x8-x9, x18-x24";}
-  case 14: {reg_list="ra, s0-s9";  xreg_list="x1, x8-x9, x18-x25";}
-  //note - to include s10, s11 must also be included
-  case 15: {reg_list="ra, s0-s11"; xreg_list="x1, x8-x9, x18-x27";}
-  default: reserved();
-}
-stack_adj      = stack_adj_base + spimm[5:4] * 16;
-
-
-
-
-
RV32E:
-
-stack_adj_base = 16;
-Valid values:
-stack_adj      = [16|32|48|64];
-
-
-
-
-
RV32I:
-
-switch (rlist) {
-  case  4.. 7: stack_adj_base = 16;
-  case  8..11: stack_adj_base = 32;
-  case 12..14: stack_adj_base = 48;
-  case     15: stack_adj_base = 64;
-}
-
-Valid values:
-switch (rlist) {
-  case  4.. 7: stack_adj = [16|32|48| 64];
-  case  8..11: stack_adj = [32|48|64| 80];
-  case 12..14: stack_adj = [48|64|80| 96];
-  case     15: stack_adj = [64|80|96|112];
-}
-
-
-
-
-
RV64:
-
-switch (rlist) {
-  case  4.. 5: stack_adj_base =  16;
-  case  6.. 7: stack_adj_base =  32;
-  case  8.. 9: stack_adj_base =  48;
-  case 10..11: stack_adj_base =  64;
-  case 12..13: stack_adj_base =  80;
-  case     14: stack_adj_base =  96;
-  case     15: stack_adj_base = 112;
-}
-
-Valid values:
-switch (rlist) {
-  case  4.. 5: stack_adj = [ 16| 32| 48| 64];
-  case  6.. 7: stack_adj = [ 32| 48| 64| 80];
-  case  8.. 9: stack_adj = [ 48| 64| 80| 96];
-  case 10..11: stack_adj = [ 64| 80| 96|112];
-  case 12..13: stack_adj = [ 80| 96|112|128];
-  case     14: stack_adj = [ 96|112|128|144];
-  case     15: stack_adj = [112|128|144|160];
-}
-
-
-
-
-

Description:

-
-
-

This instruction pops (loads) the registers in reg_list from stack memory, -and then adjusts the stack pointer by stack_adj.

-
-
- - - - - -
- - -
-

All ABI register mappings are for the UABI. An EABI version is planned once the EABI is frozen.

-
-
-
-
-

For further information see PUSH/POP Register Instructions.

-
-
-

Stack Adjustment Calculation:

-
-
-

stack_adj_base is the minimum number of bytes, in multiples of 16-byte address increments, required to cover the registers in the list.

-
-
-

spimm is the number of additional 16-byte address increments allocated for the stack frame.

-
-
-

The total stack adjustment represents the total size of the stack frame, which is stack_adj_base added to spimm scaled by 16, -as defined above.

-
-
-

Prerequisites:

-
-
-

None

-
-
-

32-bit equivalent:

-
-
-

No direct equivalent encoding exists

-
-
-

Operation:

-
-
-

The first section of pseudocode may be executed multiple times before the instruction successfully completes.

-
-
-
-
//This is not SAIL, it's pseudocode. The SAIL hasn't been written yet.
-
-if (XLEN==32) bytes=4; else bytes=8;
-
-addr=sp+stack_adj-bytes;
-for(i in 27,26,25,24,23,22,21,20,19,18,9,8,1)  {
-  //if register i is in xreg_list
-  if (xreg_list[i]) {
-    switch(bytes) {
-      4:  asm("lw x[i], 0(addr)");
-      8:  asm("ld x[i], 0(addr)");
-    }
-    addr-=bytes;
-  }
-}
-
-
-
-

The final section of pseudocode executes atomically, and only executes if the section above completes without any exceptions or interrupts.

-
-
-
-
//This is not SAIL, it's pseudocode. The SAIL hasn't been written yet.
-
-sp+=stack_adj;
-
-
-
-
-
-

29.13.9. cm.popretz

-
-

Synopsis:

-
-
-

Destroy stack frame: load ra and 0 to 12 saved registers from the stack frame, deallocate the stack frame, move zero into a0, return to ra.

-
-
-

Mnemonic:

-
-
-

cm.popretz {reg_list}, stack_adj

-
-
-

Encoding (RV32, RV64):

-
-
-
-Diagram -
-
-
- - - - - -
- - -
-

rlist values 0 to 3 are reserved for a future EABI variant called cm.popretz.e

-
-
-
-
-

Assembly Syntax:

-
-
-
-
cm.popretz {reg_list},  stack_adj
-cm.popretz {xreg_list}, stack_adj
-
-
-
-
-
RV32E:
-
-switch (rlist){
-  case  4: {reg_list="ra";         xreg_list="x1";}
-  case  5: {reg_list="ra, s0";     xreg_list="x1, x8";}
-  case  6: {reg_list="ra, s0-s1";  xreg_list="x1, x8-x9";}
-  default: reserved();
-}
-stack_adj      = stack_adj_base + spimm[5:4] * 16;
-
-
-
-
-
RV32I, RV64:
-
-switch (rlist){
-  case  4: {reg_list="ra";         xreg_list="x1";}
-  case  5: {reg_list="ra, s0";     xreg_list="x1, x8";}
-  case  6: {reg_list="ra, s0-s1";  xreg_list="x1, x8-x9";}
-  case  7: {reg_list="ra, s0-s2";  xreg_list="x1, x8-x9, x18";}
-  case  8: {reg_list="ra, s0-s3";  xreg_list="x1, x8-x9, x18-x19";}
-  case  9: {reg_list="ra, s0-s4";  xreg_list="x1, x8-x9, x18-x20";}
-  case 10: {reg_list="ra, s0-s5";  xreg_list="x1, x8-x9, x18-x21";}
-  case 11: {reg_list="ra, s0-s6";  xreg_list="x1, x8-x9, x18-x22";}
-  case 12: {reg_list="ra, s0-s7";  xreg_list="x1, x8-x9, x18-x23";}
-  case 13: {reg_list="ra, s0-s8";  xreg_list="x1, x8-x9, x18-x24";}
-  case 14: {reg_list="ra, s0-s9";  xreg_list="x1, x8-x9, x18-x25";}
-  //note - to include s10, s11 must also be included
-  case 15: {reg_list="ra, s0-s11"; xreg_list="x1, x8-x9, x18-x27";}
-  default: reserved();
-}
-stack_adj      = stack_adj_base + spimm[5:4] * 16;
-
-
-
-
-
RV32E:
-
-stack_adj_base = 16;
-Valid values:
-stack_adj      = [16|32|48|64];
-
-
-
-
-
RV32I:
-
-switch (rlist) {
-  case  4.. 7: stack_adj_base = 16;
-  case  8..11: stack_adj_base = 32;
-  case 12..14: stack_adj_base = 48;
-  case     15: stack_adj_base = 64;
-}
-
-Valid values:
-switch (rlist) {
-  case  4.. 7: stack_adj = [16|32|48| 64];
-  case  8..11: stack_adj = [32|48|64| 80];
-  case 12..14: stack_adj = [48|64|80| 96];
-  case     15: stack_adj = [64|80|96|112];
-}
-
-
-
-
-
RV64:
-
-switch (rlist) {
-  case  4.. 5: stack_adj_base =  16;
-  case  6.. 7: stack_adj_base =  32;
-  case  8.. 9: stack_adj_base =  48;
-  case 10..11: stack_adj_base =  64;
-  case 12..13: stack_adj_base =  80;
-  case     14: stack_adj_base =  96;
-  case     15: stack_adj_base = 112;
-}
-
-Valid values:
-switch (rlist) {
-  case  4.. 5: stack_adj = [ 16| 32| 48| 64];
-  case  6.. 7: stack_adj = [ 32| 48| 64| 80];
-  case  8.. 9: stack_adj = [ 48| 64| 80| 96];
-  case 10..11: stack_adj = [ 64| 80| 96|112];
-  case 12..13: stack_adj = [ 80| 96|112|128];
-  case     14: stack_adj = [ 96|112|128|144];
-  case     15: stack_adj = [112|128|144|160];
-}
-
-
-
-
-

Description:

-
-
-

This instruction pops (loads) the registers in reg_list from stack memory, adjusts the stack pointer by stack_adj, moves zero into a0 and then returns to ra.

-
-
- - - - - -
- - -
-

All ABI register mappings are for the UABI. An EABI version is planned once the EABI is frozen.

-
-
-
-
-

For further information see PUSH/POP Register Instructions.

-
-
-

Stack Adjustment Calculation:

-
-
-

stack_adj_base is the minimum number of bytes, in multiples of 16-byte address increments, required to cover the registers in the list.

-
-
-

spimm is the number of additional 16-byte address increments allocated for the stack frame.

-
-
-

The total stack adjustment represents the total size of the stack frame, which is stack_adj_base added to spimm scaled by 16, as defined above.

-
-
-

Prerequisites:

-
-
-

None

-
-
-

32-bit equivalent:

-
-
-

No direct equivalent encoding exists

-
-
-

Operation:

-
-
-

The first section of pseudocode may be executed multiple times before the instruction successfully completes.

-
-
-
-
//This is not SAIL, it's pseudocode. The SAIL hasn't been written yet.
-
-if (XLEN==32) bytes=4; else bytes=8;
-
-addr=sp+stack_adj-bytes;
-for(i in 27,26,25,24,23,22,21,20,19,18,9,8,1)  {
-  //if register i is in xreg_list
-  if (xreg_list[i]) {
-    switch(bytes) {
-      4:  asm("lw x[i], 0(addr)");
-      8:  asm("ld x[i], 0(addr)");
-    }
-    addr-=bytes;
-  }
-}
-
-
-
-

The final section of pseudocode executes atomically, and only executes if the section above completes without any exceptions or interrupts.

-
-
- - - - - -
- - -
-

The li a0, 0 could be executed more than once, but is included in the atomic section for convenience.

-
-
-
-
-
-
//This is not SAIL, it's pseudocode. The SAIL hasn't been written yet.
-
-asm("li a0, 0");
-sp+=stack_adj;
-asm("ret");
-
-
-
-
-
-

29.13.10. cm.popret

-
-

Synopsis:

-
-
-

Destroy stack frame: load ra and 0 to 12 saved registers from the stack frame, deallocate the stack frame, return to ra.

-
-
-

Mnemonic:

-
-
-

cm.popret {reg_list}, stack_adj

-
-
-

Encoding (RV32, RV64):

-
-
-
-Diagram -
-
-
- - - - - -
- - -
-

rlist values 0 to 3 are reserved for a future EABI variant called cm.popret.e

-
-
-
-
-

Assembly Syntax:

-
-
-
-
cm.popret {reg_list},  stack_adj
-cm.popret {xreg_list}, stack_adj
-
-
-
-

The variables used in the assembly syntax are defined below.

-
-
-
-
RV32E:
-
-switch (rlist){
-  case  4: {reg_list="ra";         xreg_list="x1";}
-  case  5: {reg_list="ra, s0";     xreg_list="x1, x8";}
-  case  6: {reg_list="ra, s0-s1";  xreg_list="x1, x8-x9";}
-  default: reserved();
-}
-stack_adj      = stack_adj_base + spimm[5:4] * 16;
-
-
-
-
-
RV32I, RV64:
-
-switch (rlist){
-  case  4: {reg_list="ra";         xreg_list="x1";}
-  case  5: {reg_list="ra, s0";     xreg_list="x1, x8";}
-  case  6: {reg_list="ra, s0-s1";  xreg_list="x1, x8-x9";}
-  case  7: {reg_list="ra, s0-s2";  xreg_list="x1, x8-x9, x18";}
-  case  8: {reg_list="ra, s0-s3";  xreg_list="x1, x8-x9, x18-x19";}
-  case  9: {reg_list="ra, s0-s4";  xreg_list="x1, x8-x9, x18-x20";}
-  case 10: {reg_list="ra, s0-s5";  xreg_list="x1, x8-x9, x18-x21";}
-  case 11: {reg_list="ra, s0-s6";  xreg_list="x1, x8-x9, x18-x22";}
-  case 12: {reg_list="ra, s0-s7";  xreg_list="x1, x8-x9, x18-x23";}
-  case 13: {reg_list="ra, s0-s8";  xreg_list="x1, x8-x9, x18-x24";}
-  case 14: {reg_list="ra, s0-s9";  xreg_list="x1, x8-x9, x18-x25";}
-  //note - to include s10, s11 must also be included
-  case 15: {reg_list="ra, s0-s11"; xreg_list="x1, x8-x9, x18-x27";}
-  default: reserved();
-}
-stack_adj      = stack_adj_base + spimm[5:4] * 16;
-
-
-
-
-
RV32E:
-
-stack_adj_base = 16;
-Valid values:
-stack_adj      = [16|32|48|64];
-
-
-
-
-
RV32I:
-
-switch (rlist) {
-  case  4.. 7: stack_adj_base = 16;
-  case  8..11: stack_adj_base = 32;
-  case 12..14: stack_adj_base = 48;
-  case     15: stack_adj_base = 64;
-}
-
-Valid values:
-switch (rlist) {
-  case  4.. 7: stack_adj = [16|32|48| 64];
-  case  8..11: stack_adj = [32|48|64| 80];
-  case 12..14: stack_adj = [48|64|80| 96];
-  case     15: stack_adj = [64|80|96|112];
-}
-
-
-
-
-
RV64:
-
-switch (rlist) {
-  case  4.. 5: stack_adj_base =  16;
-  case  6.. 7: stack_adj_base =  32;
-  case  8.. 9: stack_adj_base =  48;
-  case 10..11: stack_adj_base =  64;
-  case 12..13: stack_adj_base =  80;
-  case     14: stack_adj_base =  96;
-  case     15: stack_adj_base = 112;
-}
-
-Valid values:
-switch (rlist) {
-  case  4.. 5: stack_adj = [ 16| 32| 48| 64];
-  case  6.. 7: stack_adj = [ 32| 48| 64| 80];
-  case  8.. 9: stack_adj = [ 48| 64| 80| 96];
-  case 10..11: stack_adj = [ 64| 80| 96|112];
-  case 12..13: stack_adj = [ 80| 96|112|128];
-  case     14: stack_adj = [ 96|112|128|144];
-  case     15: stack_adj = [112|128|144|160];
-}
-
-
-
-
-

Description:

-
-
-

This instruction pops (loads) the registers in reg_list from stack memory, adjusts the stack pointer by stack_adj and then returns to ra.

-
-
- - - - - -
- - -
-

All ABI register mappings are for the UABI. An EABI version is planned once the EABI is frozen.

-
-
-
-
-

For further information see PUSH/POP Register Instructions.

-
-
-

Stack Adjustment Calculation:

-
-
-

stack_adj_base is the minimum number of bytes, in multiples of 16-byte address increments, required to cover the registers in the list.

-
-
-

spimm is the number of additional 16-byte address increments allocated for the stack frame.

-
-
-

The total stack adjustment represents the total size of the stack frame, which is stack_adj_base added to spimm scaled by 16, as defined above.

-
-
-

Prerequisites:

-
-
-

None

-
-
-

32-bit equivalent:

-
-
-

No direct equivalent encoding exists

-
-
-

Operation:

-
-
-

The first section of pseudocode may be executed multiple times before the instruction successfully completes.

-
-
-
-
//This is not SAIL, it's pseudocode. The SAIL hasn't been written yet.
-
-if (XLEN==32) bytes=4; else bytes=8;
-
-addr=sp+stack_adj-bytes;
-for(i in 27,26,25,24,23,22,21,20,19,18,9,8,1)  {
-  //if register i is in xreg_list
-  if (xreg_list[i]) {
-    switch(bytes) {
-      4:  asm("lw x[i], 0(addr)");
-      8:  asm("ld x[i], 0(addr)");
-    }
-    addr-=bytes;
-  }
-}
-
-
-
-

The final section of pseudocode executes atomically, and only executes if the section above completes without any exceptions or interrupts.

-
-
-
-
//This is not SAIL, it's pseudocode. The SAIL hasn't been written yet.
-
-sp+=stack_adj;
-asm("ret");
-
-
-
-
-
-

29.13.11. cm.mvsa01

-
-

Synopsis:

-
-
-

Move a0-a1 into two registers of s0-s7

-
-
-

Mnemonic:

-
-
-

cm.mvsa01 r1s', r2s'

-
-
-

Encoding (RV32, RV64):

-
-
-
-Diagram -
-
-
- - - - - -
- - -
-

For the encoding to be legal r1s' != r2s'.

-
-
-
-
-

Assembly Syntax:

-
-
-
-
cm.mvsa01 r1s', r2s'
-
-
-
-

Description: -This instruction moves a0 into r1s' and a1 into r2s'. r1s' and r2s' must be different. -The execution is atomic, so it is not possible to observe state where only one of r1s' or r2s' has been updated.

-
-
-

The encoding uses sreg number specifiers instead of xreg number specifiers to save encoding space. -The mapping between them is specified in the pseudocode below.

-
-
- - - - - -
- - -
-

The s register mapping is taken from the UABI, and may not match the currently unratified EABI. cm.mvsa01.e may be included in the future.

-
-
-
-
-

Prerequisites:

-
-
-

None

-
-
-

32-bit equivalent:

-
-
-

No direct equivalent encoding exists.

-
-
-

Operation:

-
-
-
-
//This is not SAIL, it's pseudocode. The SAIL hasn't been written yet.
-if (RV32E && (r1sc>1 || r2sc>1)) {
-  reserved();
-}
-xreg1 = {r1sc[2:1]>0,r1sc[2:1]==0,r1sc[2:0]};
-xreg2 = {r2sc[2:1]>0,r2sc[2:1]==0,r2sc[2:0]};
-X[xreg1] = X[10];
-X[xreg2] = X[11];
-
-
-
-
-
-

29.13.12. cm.mva01s

-
-

Synopsis:

-
-
-

Move two s0-s7 registers into a0-a1

-
-
-

Mnemonic:

-
-
-

cm.mva01s r1s', r2s'

-
-
-

Encoding (RV32, RV64):

-
-
-
-Diagram -
-
-
-

Assembly Syntax:

-
-
-
-
cm.mva01s r1s', r2s'
-
-
-
-

Description: -This instruction moves r1s' into a0 and r2s' into a1. -The execution is atomic, so it is not possible to observe state where only one of a0 or a1 has been updated.

-
-
-

The encoding uses sreg number specifiers instead of xreg number specifiers to save encoding space. -The mapping between them is specified in the pseudocode below.

-
-
- - - - - -
- - -
-

The s register mapping is taken from the UABI, and may not match the currently unratified EABI. cm.mva01s.e may be included in the future.

-
-
-
-
-

Prerequisites:

-
-
-

None

-
-
-

32-bit equivalent:

-
-
-

No direct equivalent encoding exists.

-
-
-

Operation:

-
-
-
-
//This is not SAIL, it's pseudocode. The SAIL hasn't been written yet.
-if (RV32E && (r1sc>1 || r2sc>1)) {
-  reserved();
-}
-xreg1 = {r1sc[2:1]>0,r1sc[2:1]==0,r1sc[2:0]};
-xreg2 = {r2sc[2:1]>0,r2sc[2:1]==0,r2sc[2:0]};
-X[10] = X[xreg1];
-X[11] = X[xreg2];
-
-
-
-
-
-
-

29.14. Table Jump Overview

-
-

cm.jt (Jump via table) and cm.jalt (Jump and link via table) are referred to as table jump.

-
-
-

Table jump uses a 256-entry XLEN wide table in instruction memory to contain function addresses. -The table must be at least 64-byte aligned.

-
-
-

Table entries follow the current data endianness. This is different from normal instruction fetch which is always little-endian.

-
-
-

cm.jt and cm.jalt encodings index the table, giving access to functions within the full XLEN wide address space.

-
-
-

This is used as a form of dictionary compression to reduce the code size of jal / auipc+jalr / jr / auipc+jr instructions.

-
-
-

Table jump allows the linker to replace the following instruction sequences with a cm.jt or cm.jalt encoding, and an entry in the table:

-
-
-
    -
  • -

    32-bit j calls

    -
  • -
  • -

    32-bit jal ra calls

    -
  • -
  • -

    64-bit auipc+jr calls to fixed locations

    -
  • -
  • -

    64-bit auipc+jalr ra calls to fixed locations

    -
    -
      -
    • -

      The auipc+jr/jalr sequence is used because the offset from the PC is out of the ±1MB range.

      -
    • -
    -
    -
  • -
-
-
-

If a return address stack is implemented, then as cm.jalt is equivalent to jal ra, it pushes to the stack.

-
-
-

29.14.1. jvt

-
-

The base of the table is in the jvt CSR (see jvt CSR, table jump base vector and control register), each table entry is XLEN bits.

-
-
-

If the same function is called with and without linking then it must have two entries in the table. -This is typically caused by the same function being called with and without tail calling.

-
-
-
-

29.14.2. Table Jump Fault handling

-
-

For a table jump instruction, the table entry that the instruction selects is considered an extension of the instruction itself. -Hence, the execution of a table jump instruction involves two instruction fetches, the first to read the instruction (cm.jt/cm.jalt) -and the second to read from the jump vector table (JVT). Both instruction fetches are implicit reads, and both require -execute permission; read permission is irrelevant. It is recommended that the second fetch be ignored for hardware triggers and breakpoints.

-
-
-

Memory writes to the jump vector table require an instruction barrier (fence.i) to guarantee that they are visible to the instruction fetch.

-
-
-

Multiple contexts may have different jump vector tables. JVT may be switched between them without an instruction barrier -if the tables have not been updated in memory since the last fence.i.

-
-
-

If an exception occurs on either instruction fetch, xEPC is set to the PC of the table jump instruction, xCAUSE is set as expected for the type of fault and xTVAL (if not set to zero) contains the fetch address which caused the fault.

-
-
-
-
-

29.14.3. jvt CSR

-
-

Synopsis:

-
-
-

Table jump base vector and control register

-
-
-

Address:

-
-
-

0x0017

-
-
-

Permissions:

-
-
-

URW

-
-
-

Format (RV32):

-
-
-
-Diagram -
-
-
-

Format (RV64):

-
-
-
-Diagram -
-
-
-

Description:

-
-
-

The jvt register is an XLEN-bit WARL read/write register that holds the jump table configuration, consisting of the jump table base address (BASE) and the jump table mode (MODE).

-
-
-

If Section 29.10 is implemented then jvt must also be implemented, but can contain a read-only value. If jvt is writable, the set of values the register may hold can vary by implementation. The value in the BASE field must always be aligned on a 64-byte boundary.

-
-
-

jvt.base is a virtual address, whenever virtual memory is enabled.

-
-
-

The memory pointed to by jvt.base is treated as instruction memory for the purpose of executing table jump instructions, implying execute access permission.

-
- - ---- - - - - - - - - - - - - - - - - -
Table 25. jvt.mode definition
jvt.modeComment

000000

Jump table mode

others

reserved for future standard use

-
-

jvt.mode is a WARL field, so can only be programmed to modes which are implemented. Therefore the discovery mechanism is to -attempt to program different modes and read back the values to see which are available. Jump table mode must be implemented.

-
-
- - - - - -
- - -
-

In the future, the RISC-V Unified Discovery method will report the available modes.

-
-
-
-
-

Architectural State:

-
-
-

jvt CSR adds architectural state to the system software context (such as an OS process), therefore must be saved/restored on context switches.

-
-
-

State Enable:

-
-
-

If the Smstateen extension is implemented, then bit 2 in mstateen0, sstateen0, and hstateen0 is implemented. If bit 2 of a controlling stateen0 CSR is zero, then access to the jvt CSR and execution of a cm.jalt or cm.jt instruction by a lower privilege level results in an Illegal Instruction trap (or, if appropriate, a Virtual Instruction trap).

-
-
-
-
-

29.14.4. cm.jt

-
-

Synopsis:

-
-
-

jump via table

-
-
-

Mnemonic:

-
-
-

cm.jt index

-
-
-

Encoding (RV32, RV64):

-
-
-
-Diagram -
-
-
- - - - - -
- - -
-

For this encoding to decode as cm.jt, index<32, otherwise it decodes as cm.jalt, see Jump and link via table.

-
-
-
-
- - - - - -
- - -
-

If jvt.mode = 0 (Jump Table Mode) then cm.jt behaves as specified here. If jvt.mode is a reserved value, then cm.jt is also reserved. In the future other defined values of jvt.mode may change the behaviour of cm.jt.

-
-
-
-
-

Assembly Syntax:

-
-
-
-
cm.jt index
-
-
-
-

Description:

-
-
-

cm.jt reads an entry from the jump vector table in memory and jumps to the address that was read.

-
-
-

For further information see Table Jump Overview.

-
-
-

Prerequisites:

-
-
-

None

-
-
-

32-bit equivalent:

-
-
-

No direct equivalent encoding exists.

-
-
-
-

Operation:

-
-
-
-
//This is not SAIL, it's pseudocode. The SAIL hasn't been written yet.
-
-# target_address is temporary internal state, it doesn't represent a real register
-# InstMemory is byte indexed
-
-switch(XLEN) {
-  32:  table_address[XLEN-1:0] = jvt.base + (index<<2);
-  64:  table_address[XLEN-1:0] = jvt.base + (index<<3);
-}
-
-//fetch from the jump table
-target_address[XLEN-1:0] = InstMemory[table_address][XLEN-1:0];
-
-j target_address[XLEN-1:0]&~0x1;
-
-
-
-
-
-

29.14.5. cm.jalt

-
-

Synopsis:

-
-
-

jump via table with optional link

-
-
-

Mnemonic:

-
-
-

cm.jalt index

-
-
-

Encoding (RV32, RV64):

-
-
-
-Diagram -
-
-
- - - - - -
- - -
-

For this encoding to decode as cm.jalt, index>=32, otherwise it decodes as cm.jt, see Jump via table.

-
-
-
-
- - - - - -
- - -
-

If jvt.mode = 0 (Jump Table Mode) then cm.jalt behaves as specified here. If jvt.mode is a reserved value, then cm.jalt is also reserved. In the future other defined values of jvt.mode may change the behaviour of cm.jalt.

-
-
-
-
-

Assembly Syntax:

-
-
-
-
cm.jalt index
-
-
-
-

Description:

-
-
-

cm.jalt reads an entry from the jump vector table in memory and jumps to the address that was read, linking to ra.

-
-
-

For further information see Table Jump Overview.

-
-
-

Prerequisites:

-
-
-

None

-
-
-

32-bit equivalent:

-
-
-

No direct equivalent encoding exists.

-
-
-
-

Operation:

-
-
-
-
//This is not SAIL, it's pseudocode. The SAIL hasn't been written yet.
-
-# target_address is temporary internal state, it doesn't represent a real register
-# InstMemory is byte indexed
-
-switch(XLEN) {
-  32:  table_address[XLEN-1:0] = jvt.base + (index<<2);
-  64:  table_address[XLEN-1:0] = jvt.base + (index<<3);
-}
-
-//fetch from the jump table
-target_address[XLEN-1:0] = InstMemory[table_address][XLEN-1:0];
-
-jal ra, target_address[XLEN-1:0]&~0x1;
-
-
-
-
-
-
-
-

30. "B" Extension for Bit Manipulation, Version 1.0.0

-
-
-

The B standard extension comprises instructions provided by the Zba, Zbb, and -Zbs extensions.

-
-
-

30.1. Zb* Overview

-
-

The bit-manipulation (bitmanip) extension collection is comprised of several component extensions to the base RISC-V architecture that are intended to provide some combination of code size reduction, performance improvement, and energy reduction. -While the instructions are intended to have general use, some instructions are more useful in some domains than others. -Hence, several smaller bitmanip extensions are provided. Each of these smaller extensions is grouped by common function and use case, and each has its own Zb*-extension name.

-
-
-

Each bitmanip extension includes a group of several bitmanip instructions that have similar purposes and that can often share the same logic. Some instructions are available in only one extension while others are available in several. -The instructions have mnemonics and encodings that are independent of the extensions in which they appear. -Thus, when implementing extensions with overlapping instructions, there is no redundancy in logic or encoding.

-
-
-

The bitmanip extensions are defined for RV32 and RV64. -Most of the instructions are expected to be forward compatible with RV128. -While the shift-immediate instructions are defined to have at most a 6-bit immediate field, a 7th bit is available in the encoding space should this be needed for RV128.

-
-
-
-

30.2. Word Instructions

-
-

The bitmanip extension follows the convention in RV64 that w-suffixed instructions (without a dot before the w) ignore the upper 32 bits of their inputs, operate on the least-significant 32-bits as signed values and produce a 32-bit signed result that is sign-extended to XLEN.

-
-
-

Bitmanip instructions with the suffix .uw have one operand that is an unsigned 32-bit value that is extracted from the least significant 32 bits of the specified register. Other than that, these perform full XLEN operations.

-
-
-

Bitmanip instructions with the suffix .b, .h and .w only look at the least significant 8-bits, 16-bits and 32-bits of the input (respectively) and produce an XLEN-wide result that is sign-extended or zero-extended, based on the specific instruction.

-
-
-
-

30.3. Pseudocode for instruction semantics

-
-

The semantics of each instruction in Instructions (in alphabetical order) is expressed in a SAIL-like syntax.

-
-
-
-

30.4. Extensions

-
-

The first group of bitmanip extensions to be released for Public Review is:

-
- -
-

Below is a list of all of the instructions that are included in these extensions -along with their specific mapping:

-
- ---------- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
RV32RV64MnemonicInstructionZbaZbbZbcZbs

add.uw rd, rs1, rs2

Add unsigned word

andn rd, rs1, rs2

AND with inverted operand

clmul rd, rs1, rs2

Carry-less multiply (low-part)

clmulh rd, rs1, rs2

Carry-less multiply (high-part)

clmulr rd, rs1, rs2

Carry-less multiply (reversed)

clz rd, rs

Count leading zero bits

clzw rd, rs

Count leading zero bits in word

cpop rd, rs

Count set bits

cpopw rd, rs

Count set bits in word

ctz rd, rs

Count trailing zero bits

ctzw rd, rs

Count trailing zero bits in word

max rd, rs1, rs2

Maximum

maxu rd, rs1, rs2

Unsigned maximum

min rd, rs1, rs2

Minimum

minu rd, rs1, rs2

Unsigned minimum

orc.b rd, rs

Bitwise OR-Combine, byte granule

orn rd, rs1, rs2

OR with inverted operand

rev8 rd, rs

Byte-reverse register

rol rd, rs1, rs2

Rotate left (Register)

rolw rd, rs1, rs2

Rotate left word (Register)

ror rd, rs1, rs2

Rotate right (Register)

rori rd, rs1, shamt

Rotate right (Immediate)

roriw rd, rs1, shamt

Rotate right word (Immediate)

rorw rd, rs1, rs2

Rotate right word (Register)

bclr rd, rs1, rs2

Single-Bit Clear (Register)

bclri rd, rs1, imm

Single-Bit Clear (Immediate)

bext rd, rs1, rs2

Single-Bit Extract (Register)

bexti rd, rs1, imm

Single-Bit Extract (Immediate)

binv rd, rs1, rs2

Single-Bit Invert (Register)

binvi rd, rs1, imm

Single-Bit Invert (Immediate)

bset rd, rs1, rs2

Single-Bit Set (Register)

bseti rd, rs1, imm

Single-Bit Set (Immediate)

sext.b rd, rs

Sign-extend byte

sext.h rd, rs

Sign-extend halfword

sh1add rd, rs1, rs2

Shift left by 1 and add

sh1add.uw rd, rs1, rs2

Shift unsigned word left by 1 and add

sh2add rd, rs1, rs2

Shift left by 2 and add

sh2add.uw rd, rs1, rs2

Shift unsigned word left by 2 and add

sh3add rd, rs1, rs2

Shift left by 3 and add

sh3add.uw rd, rs1, rs2

Shift unsigned word left by 3 and add

slli.uw rd, rs1, imm

Shift-left unsigned word (Immediate)

xnor rd, rs1, rs2

Exclusive NOR

zext.h rd, rs

Zero-extend halfword

-
-

30.4.1. Zba: Address generation

-
-

The Zba instructions can be used to accelerate the generation of addresses that index into arrays of basic types (halfword, word, doubleword) using both unsigned word-sized and XLEN-sized indices: a shifted index is added to a base address.

-
-
-

The shift and add instructions do a left shift of 1, 2, or 3 because these are commonly found in real-world code and because they can be implemented with a minimal amount of additional hardware beyond that of the simple adder. This avoids lengthening the critical path in implementations.

-
-
-

While the shift and add instructions are limited to a maximum left shift of 3, the slli instruction (from the base ISA) can be used to perform similar shifts for indexing into arrays of wider elements. The slli.uw — added in this extension — can be used when the index is to be interpreted as an unsigned word.

-
-
-

The following instructions (and pseudoinstructions) comprise the Zba extension:

-
- ------ - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
RV32RV64MnemonicInstruction

add.uw rd, rs1, rs2

Add unsigned word

sh1add rd, rs1, rs2

Shift left by 1 and add

sh1add.uw rd, rs1, rs2

Shift unsigned word left by 1 and add

sh2add rd, rs1, rs2

Shift left by 2 and add

sh2add.uw rd, rs1, rs2

Shift unsigned word left by 2 and add

sh3add rd, rs1, rs2

Shift left by 3 and add

sh3add.uw rd, rs1, rs2

Shift unsigned word left by 3 and add

slli.uw rd, rs1, imm

Shift-left unsigned word (Immediate)

zext.w rd, rs

Add unsigned word

-
-
-

30.4.2. Zbb: Basic bit-manipulation

-
-
Logical with negate
- ------ - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
RV32RV64MnemonicInstruction

andn rd, rs1, rs2

AND with inverted operand

orn rd, rs1, rs2

OR with inverted operand

xnor rd, rs1, rs2

Exclusive NOR

-
- - - - - -
- - -
Implementation Hint
-
-

The Logical with Negate instructions can be implemented by inverting the rs2 inputs to the base-required AND, OR, and XOR logic instructions. -In some implementations, the inverter on rs2 used for subtraction can be reused for this purpose.

-
-
-
-
-
-
Count leading/trailing zero bits
- ------ - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
RV32RV64MnemonicInstruction

clz rd, rs

Count leading zero bits

clzw rd, rs

Count leading zero bits in word

ctz rd, rs

Count trailing zero bits

ctzw rd, rs

Count trailing zero bits in word

-
-
-
Count population
-
-

These instructions count the number of set bits (1-bits). This is also -commonly referred to as population count.

-
- ------ - - - - - - - - - - - - - - - - - - - - - - -
RV32RV64MnemonicInstruction

cpop rd, rs

Count set bits

cpopw rd, rs

Count set bits in word

-
-
-
Integer minimum/maximum
-
-

The integer minimum/maximum instructions are arithmetic R-type -instructions that return the smaller/larger of two operands.

-
- ------ - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
RV32RV64MnemonicInstruction

max rd, rs1, rs2

Maximum

maxu rd, rs1, rs2

Unsigned maximum

min rd, rs1, rs2

Minimum

minu rd, rs1, rs2

Unsigned minimum

-
-
-
Sign extension and zero extension
-
-

These instructions perform the sign extension or zero extension of the least significant 8 bits or 16 bits of the source register.

-
-
-

These instructions replace the generalized idioms slli rD,rS,(XLEN-<size>) + srli (for zero extension) or slli + srai (for sign extension) for the sign extension of 8-bit and 16-bit quantities, and for the zero extension of 16-bit quantities.

-
- ------ - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
RV32RV64MnemonicInstruction

sext.b rd, rs

Sign-extend byte

sext.h rd, rs

Sign-extend halfword

zext.h rd, rs

Zero-extend halfword

-
-
-
Bitwise rotation
-
-

Bitwise rotation instructions are similar to the shift-logical operations from the base spec. However, where the shift-logical -instructions shift in zeros, the rotate instructions shift in the bits that were shifted out of the other side of the value. -Such operations are also referred to as ‘circular shifts’.

-
- ------ - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
RV32RV64MnemonicInstruction

rol rd, rs1, rs2

Rotate left (Register)

rolw rd, rs1, rs2

Rotate Left Word (Register)

ror rd, rs1, rs2

Rotate right (Register)

rori rd, rs1, shamt

Rotate right (Immediate)

roriw rd, rs1, shamt

Rotate right Word (Immediate)

rorw rd, rs1, rs2

Rotate right Word (Register)

-
- - - - - -
- - -
Architecture Explanation
-
-

The rotate instructions were included to replace a common -four-instruction sequence to achieve the same effect (neg; sll/srl; srl/sll; or).

-
-
-
-
-
-
OR Combine
-
-

orc.b sets the bits of each byte in the result rd to all zeros if no bit within the respective byte of rs is set, or to all ones if any bit within the respective byte of rs is set.

-
-
-

One use-case is string-processing functions, such as strlen and strcpy, which can use orc.b to test for the terminating zero byte by counting the set bits in leading non-zero bytes in a word.

-
- ------ - - - - - - - - - - - - - - - - -
RV32RV64MnemonicInstruction

orc.b rd, rs

Bitwise OR-Combine, byte granule

-
-
-
Byte-reverse
-
-

rev8 reverses the byte-ordering of rs.

-
- ------ - - - - - - - - - - - - - - - - -
RV32RV64MnemonicInstruction

rev8 rd, rs

Byte-reverse register

-
-
-
-

30.4.3. Zbc: Carry-less multiplication

-
-

Carry-less multiplication is the multiplication in the polynomial ring over GF(2).

-
-
-

clmul produces the lower half of the carry-less product and clmulh produces the upper half of the 2✕XLEN carry-less product.

-
-
-

clmulr produces bits 2✕XLEN−2:XLEN-1 of the 2✕XLEN carry-less product.

-
- ------ - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
RV32RV64MnemonicInstruction

clmul rd, rs1, rs2

Carry-less multiply (low-part)

clmulh rd, rs1, rs2

Carry-less multiply (high-part)

clmulr rd, rs1, rs2

Carry-less multiply (reversed)

-
-
-

30.4.4. Zbs: Single-bit instructions

-
-

The single-bit instructions provide a mechanism to set, clear, invert, or extract -a single bit in a register. The bit is specified by its index.

-
- ------ - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
RV32RV64MnemonicInstruction

bclr rd, rs1, rs2

Single-Bit Clear (Register)

bclri rd, rs1, imm

Single-Bit Clear (Immediate)

bext rd, rs1, rs2

Single-Bit Extract (Register)

bexti rd, rs1, imm

Single-Bit Extract (Immediate)

binv rd, rs1, rs2

Single-Bit Invert (Register)

binvi rd, rs1, imm

Single-Bit Invert (Immediate)

bset rd, rs1, rs2

Single-Bit Set (Register)

bseti rd, rs1, imm

Single-Bit Set (Immediate)

-
-
-

30.4.5. Zbkb: Bit-manipulation for Cryptography

-
-

This extension contains instructions essential for implementing -common operations in cryptographic workloads.

-
- ------ - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
RV32RV64MnemonicInstruction

rol

Rotate left (Register)

rolw

Rotate Left Word (Register)

ror

Rotate right (Register)

rori

Rotate right (Immediate)

roriw

Rotate right Word (Immediate)

rorw

Rotate right Word (Register)

andn

AND with inverted operand

orn

OR with inverted operand

xnor

Exclusive NOR

pack

Pack low halves of registers

packh

Pack low bytes of registers

packw

Pack low 16-bits of registers (RV64)

rev.b

Reverse bits in bytes

rev8

Byte-reverse register

zip

Bit interleave

unzip

Bit deinterleave

-
-
-

30.4.6. Zbkc: Carry-less multiplication for Cryptography

-
-

Carry-less multiplication is the multiplication in the polynomial ring over -GF(2). This is a critical operation in some cryptographic workloads, -particularly the AES-GCM authenticated encryption scheme. -This extension provides only the instructions needed to -efficiently implement the GHASH operation, which is part of this workload.

-
- ------ - - - - - - - - - - - - - - - - - - - - - - -
RV32RV64MnemonicInstruction

clmul rd, rs1, rs2

Carry-less multiply (low-part)

clmulh rd, rs1, rs2

Carry-less multiply (high-part)

-
-
-

30.4.7. Zbkx: Crossbar permutations

-
-

These instructions implement a "lookup table" for 4 and 8 bit elements -inside the general purpose registers. -rs1 is used as a vector of N-bit words, and rs2 as a vector of N-bit -indices into rs1. -Elements in rs1 are replaced by the indexed element in rs2, or zero -if the index into rs2 is out of bounds.

-
-
-

These instructions are useful for expressing N-bit to N-bit boolean -operations, and implementing cryptographic code with secret -dependent memory accesses (particularly SBoxes) such that the execution -latency does not depend on the (secret) data being operated on.

-
- ------ - - - - - - - - - - - - - - - - - - - - - - -
RV32RV64MnemonicInstruction

xperm.n rd, rs1, rs2

Crossbar permutation (nibbles)

xperm.b rd, rs1, rs2

Crossbar permutation (bytes)

-
-
-
-
-

30.5. Instructions (in alphabetical order)

-
-

30.5.1. add.uw

-
-
-
Synopsis
-
-

Add unsigned word

-
-
Mnemonic
-
-

add.uw rd, rs1, rs2

-
-
Pseudoinstructions
-
-

zext.w rd, rs1 → add.uw rd, rs1, zero

-
-
Encoding
-
-
-
-
-Diagram -
-
-
-
-
Description
-
-

This instruction performs an XLEN-wide addition between rs2 and the zero-extended least-significant word of rs1.

-
-
Operation
-
-
-
-
-
let base = X(rs2);
-let index = EXTZ(X(rs1)[31..0]);
-
-X(rd) = base + index;
-
-
-
-
-
Included in
-
-
- ----- - - - - - - - - - - - - - - -
ExtensionMinimum versionLifecycle state

Zba (Address generation instructions)

0.93

Ratified

-
-
-
-

30.5.2. andn

-
-
-
Synopsis
-
-

AND with inverted operand

-
-
Mnemonic
-
-

andn rd, rs1, rs2

-
-
Encoding
-
-
-
-
-Diagram -
-
-
-
-
Description
-
-

This instruction performs the bitwise logical AND operation between rs1 and the bitwise inversion of rs2.

-
-
Operation
-
-
-
-
-
X(rd) = X(rs1) & ~X(rs2);
-
-
-
-
-
Included in
-
-
- ----- - - - - - - - - - - - - - - - - - - - -
ExtensionMinimum versionLifecycle state

Zbb (Basic bit-manipulation)

v1.0

Ratified

Zbkb (Bit-manipulation for Cryptography)

v1.0

Ratified

-
-
-
-

30.5.3. bclr

-
-
-
Synopsis
-
-

Single-Bit Clear (Register)

-
-
Mnemonic
-
-

bclr rd, rs1, rs2

-
-
Encoding
-
-
-
-
-Diagram -
-
-
-
-
Description
-
-

This instruction returns rs1 with a single bit cleared at the index specified in rs2. -The index is read from the lower log2(XLEN) bits of rs2.

-
-
Operation
-
-
-
-
-
let index = X(rs2) & (XLEN - 1);
-X(rd) = X(rs1) & ~(1 << index)
-
-
-
-
-
Included in
-
-
- ----- - - - - - - - - - - - - - - -
ExtensionMinimum versionLifecycle state

Zbs (Single-bit instructions)

v1.0

Ratified

-
-
-
-

30.5.4. bclri

-
-
-
Synopsis
-
-

Single-Bit Clear (Immediate)

-
-
Mnemonic
-
-

bclri rd, rs1, shamt

-
-
Encoding (RV32)
-
-
-
-
-Diagram -
-
-
-
-
Encoding (RV64)
-
-
-
-
-Diagram -
-
-
-
-
Description
-
-

This instruction returns rs1 with a single bit cleared at the index specified in shamt. -The index is read from the lower log2(XLEN) bits of shamt. -For RV32, the encodings corresponding to shamt[5]=1 are reserved.

-
-
Operation
-
-
-
-
-
let index = shamt & (XLEN - 1);
-X(rd) = X(rs1) & ~(1 << index)
-
-
-
-
-
Included in
-
-
- ----- - - - - - - - - - - - - - - -
ExtensionMinimum versionLifecycle state

Zbs (Single-bit instructions)

v1.0

Ratified

-
-
-
-

30.5.5. bext

-
-
-
Synopsis
-
-

Single-Bit Extract (Register)

-
-
Mnemonic
-
-

bext rd, rs1, rs2

-
-
Encoding
-
-
-
-
-Diagram -
-
-
-
-
Description
-
-

This instruction returns a single bit extracted from rs1 at the index specified in rs2. -The index is read from the lower log2(XLEN) bits of rs2.

-
-
Operation
-
-
-
-
-
let index = X(rs2) & (XLEN - 1);
-X(rd) = (X(rs1) >> index) & 1;
-
-
-
-
-
Included in
-
-
- ----- - - - - - - - - - - - - - - -
ExtensionMinimum versionLifecycle state

Zbs (Single-bit instructions)

v1.0

Ratified

-
-
-
-

30.5.6. bexti

-
-
-
Synopsis
-
-

Single-Bit Extract (Immediate)

-
-
Mnemonic
-
-

bexti rd, rs1, shamt

-
-
Encoding (RV32)
-
-
-
-
-Diagram -
-
-
-
-
Encoding (RV64)
-
-
-
-
-Diagram -
-
-
-
-
Description
-
-

This instruction returns a single bit extracted from rs1 at the index specified in shamt. -The index is read from the lower log2(XLEN) bits of shamt. -For RV32, the encodings corresponding to shamt[5]=1 are reserved.

-
-
Operation
-
-
-
-
-
let index = shamt & (XLEN - 1);
-X(rd) = (X(rs1) >> index) & 1;
-
-
-
-
-
Included in
-
-
- ----- - - - - - - - - - - - - - - -
ExtensionMinimum versionLifecycle state

Zbs (Single-bit instructions)

v1.0

Ratified

-
-
-
-

30.5.7. binv

-
-
-
Synopsis
-
-

Single-Bit Invert (Register)

-
-
Mnemonic
-
-

binv rd, rs1, rs2

-
-
Encoding
-
-
-
-
-Diagram -
-
-
-
-
Description
-
-

This instruction returns rs1 with a single bit inverted at the index specified in rs2. -The index is read from the lower log2(XLEN) bits of rs2.

-
-
Operation
-
-
-
-
-
let index = X(rs2) & (XLEN - 1);
-X(rd) = X(rs1) ^ (1 << index)
-
-
-
-
-
Included in
-
-
- ----- - - - - - - - - - - - - - - -
ExtensionMinimum versionLifecycle state

Zbs (Single-bit instructions)

v1.0

Ratified

-
-
-
-

30.5.8. binvi

-
-
-
Synopsis
-
-

Single-Bit Invert (Immediate)

-
-
Mnemonic
-
-

binvi rd, rs1, shamt

-
-
Encoding (RV32)
-
-
-
-
-Diagram -
-
-
-
-
Encoding (RV64)
-
-
-
-
-Diagram -
-
-
-
-
Description
-
-

This instruction returns rs1 with a single bit inverted at the index specified in shamt. -The index is read from the lower log2(XLEN) bits of shamt. -For RV32, the encodings corresponding to shamt[5]=1 are reserved.

-
-
Operation
-
-
-
-
-
let index = shamt & (XLEN - 1);
-X(rd) = X(rs1) ^ (1 << index)
-
-
-
-
-
Included in
-
-
- ----- - - - - - - - - - - - - - - -
ExtensionMinimum versionLifecycle state

Zbs (Single-bit instructions)

v1.0

Ratified

-
-
-
-

30.5.9. bset

-
-
-
Synopsis
-
-

Single-Bit Set (Register)

-
-
Mnemonic
-
-

bset rd, rs1, rs2

-
-
Encoding
-
-
-
-
-Diagram -
-
-
-
-
Description
-
-

This instruction returns rs1 with a single bit set at the index specified in rs2. -The index is read from the lower log2(XLEN) bits of rs2.

-
-
Operation
-
-
-
-
-
let index = X(rs2) & (XLEN - 1);
-X(rd) = X(rs1) | (1 << index)
-
-
-
-
-
Included in
-
-
- ----- - - - - - - - - - - - - - - -
ExtensionMinimum versionLifecycle state

Zbs (Single-bit instructions)

v1.0

Ratified

-
-
-
-

30.5.10. bseti

-
-
-
Synopsis
-
-

Single-Bit Set (Immediate)

-
-
Mnemonic
-
-

bseti rd, rs1, shamt

-
-
Encoding (RV32)
-
-
-
-
-Diagram -
-
-
-
-
Encoding (RV64)
-
-
-
-
-Diagram -
-
-
-
-
Description
-
-

This instruction returns rs1 with a single bit set at the index specified in shamt. -The index is read from the lower log2(XLEN) bits of shamt. -For RV32, the encodings corresponding to shamt[5]=1 are reserved.

-
-
Operation
-
-
-
-
-
let index = shamt & (XLEN - 1);
-X(rd) = X(rs1) | (1 << index)
-
-
-
-
-
Included in
-
-
- ----- - - - - - - - - - - - - - - -
ExtensionMinimum versionLifecycle state

Zbs (Single-bit instructions)

v1.0

Ratified

-
-
-
-

30.5.11. clmul

-
-
-
Synopsis
-
-

Carry-less multiply (low-part)

-
-
Mnemonic
-
-

clmul rd, rs1, rs2

-
-
Encoding
-
-
-
-
-Diagram -
-
-
-
-
Description
-
-

clmul produces the lower half of the 2·XLEN carry-less product.

-
-
Operation
-
-
-
-
-
let rs1_val = X(rs1);
-let rs2_val = X(rs2);
-let output : xlenbits = 0;
-
-foreach (i from 0 to (xlen - 1) by 1) {
-   output = if   ((rs2_val >> i) & 1)
-            then output ^ (rs1_val << i);
-            else output;
-}
-
-X[rd] = output
-
-
-
-
-
Included in
-
-
- ----- - - - - - - - - - - - - - - - - - - - -
ExtensionMinimum versionLifecycle state

Zbc (Carry-less multiplication)

v1.0

Ratified

Zbkc (Carry-less multiplication for Cryptography)

v1.0

Ratified

-
-
-
-

30.5.12. clmulh

-
-
-
Synopsis
-
-

Carry-less multiply (high-part)

-
-
Mnemonic
-
-

clmulh rd, rs1, rs2

-
-
Encoding
-
-
-
-
-Diagram -
-
-
-
-
Description
-
-

clmulh produces the upper half of the 2·XLEN carry-less product.

-
-
Operation
-
-
-
-
-
let rs1_val = X(rs1);
-let rs2_val = X(rs2);
-let output : xlenbits = 0;
-
-foreach (i from 1 to xlen by 1) {
-   output = if   ((rs2_val >> i) & 1)
-            then output ^ (rs1_val >> (xlen - i));
-            else output;
-}
-
-X[rd] = output
-
-
-
-
-
Included in
-
-
- ----- - - - - - - - - - - - - - - - - - - - -
ExtensionMinimum versionLifecycle state

Zbc (Carry-less multiplication)

v1.0

Ratified

Zbkc (Carry-less multiplication for Cryptography)

v1.0

Ratified

-
-
-
-

30.5.13. clmulr

-
-
-
Synopsis
-
-

Carry-less multiply (reversed)

-
-
Mnemonic
-
-

clmulr rd, rs1, rs2

-
-
Encoding
-
-
-
-
-Diagram -
-
-
-
-
Description
-
-

clmulr produces bits 2·XLEN−2:XLEN-1 of the 2·XLEN carry-less -product.

-
-
Operation
-
-
-
-
-
let rs1_val = X(rs1);
-let rs2_val = X(rs2);
-let output : xlenbits = 0;
-
-foreach (i from 0 to (xlen - 1) by 1) {
-   output = if   ((rs2_val >> i) & 1)
-            then output ^ (rs1_val >> (xlen - i - 1));
-            else output;
-}
-
-X[rd] = output
-
-
-
- - - - - -
- - -
Note
-
-

The clmulr instruction is used to accelerate CRC calculations. -The r in the instruction’s mnemonic stands for reversed, as the -instruction is equivalent to bit-reversing the inputs, performing -a clmul, then bit-reversing the output.

-
-
-
-
-
-
Included in
-
-
- ----- - - - - - - - - - - - - - - -
ExtensionMinimum versionLifecycle state

Zbc (Carry-less multiplication)

v1.0

Ratified

-
-
-
-

30.5.14. clz

-
-
-
Synopsis
-
-

Count leading zero bits

-
-
Mnemonic
-
-

clz rd, rs

-
-
Encoding
-
-
-
-
-Diagram -
-
-
-
-
Description
-
-

This instruction counts the number of 0’s before the first 1, starting at the most-significant bit (i.e., XLEN-1) and progressing to bit 0. Accordingly, if the input is 0, the output is XLEN, and if the most-significant bit of the input is a 1, the output is 0.

-
-
Operation
-
-
-
-
-
val HighestSetBit : forall ('N : Int), 'N >= 0. bits('N) -> int
-
-function HighestSetBit x = {
-  foreach (i from (xlen - 1) to 0 by 1 in dec)
-    if [x[i]] == 0b1 then return(i) else ();
-  return -1;
-}
-
-let rs = X(rs);
-X[rd] = (xlen - 1) - HighestSetBit(rs);
-
-
-
-
-
Included in
-
-
- ----- - - - - - - - - - - - - - - -
ExtensionMinimum versionLifecycle state

Zbb (Basic bit-manipulation)

v1.0

Ratified

-
-
-
-

30.5.15. clzw

-
-
-
Synopsis
-
-

Count leading zero bits in word

-
-
Mnemonic
-
-

clzw rd, rs

-
-
Encoding
-
-
-
-
-Diagram -
-
-
-
-
Description
-
-

This instruction counts the number of 0’s before the first 1 starting at bit 31 and progressing to bit 0. -Accordingly, if the least-significant word is 0, the output is 32, and if the most-significant bit of the word (i.e., bit 31) is a 1, the output is 0.

-
-
Operation
-
-
-
-
-
val HighestSetBit32 : forall ('N : Int), 'N >= 0. bits('N) -> int
-
-function HighestSetBit32 x = {
-  foreach (i from 31 to 0 by 1 in dec)
-    if [x[i]] == 0b1 then return(i) else ();
-  return -1;
-}
-
-let rs = X(rs);
-X[rd] = 31 - HighestSetBit32(rs);
-
-
-
-
-
Included in
-
-
- ----- - - - - - - - - - - - - - - -
ExtensionMinimum versionLifecycle state

Zbb (Basic bit-manipulation)

v1.0

Ratified

-
-
-
-

30.5.16. cpop

-
-
-
Synopsis
-
-

Count set bits

-
-
Mnemonic
-
-

cpop rd, rs

-
-
Encoding
-
-
-
-
-Diagram -
-
-
-
-
Description
-
-

This instruction counts the number of 1’s (i.e., set bits) in the source register.

-
-
Operation
-
-
-
-
-
let bitcount = 0;
-let rs = X(rs);
-
-foreach (i from 0 to (xlen - 1) in inc)
-    if rs[i] == 0b1 then bitcount = bitcount + 1 else ();
-
-X[rd] = bitcount
-
-
-
- - - - - -
- - -
Software Hint
-
-

This operation is known as population count, popcount, sideways sum, bit summation, or Hamming weight.

-
-
-

The GCC builtin function __builtin_popcount (unsigned int x) is implemented by cpop on RV32 and by cpopw on RV64. -The GCC builtin function __builtin_popcountl (unsigned long x) for LP64 is implemented by cpop on RV64.

-
-
-
-
-
-
Included in
-
-
- ----- - - - - - - - - - - - - - - -
ExtensionMinimum versionLifecycle state

Zbb (Basic bit-manipulation)

v1.0

Ratified

-
-
-
-

30.5.17. cpopw

-
-
-
Synopsis
-
-

Count set bits in word

-
-
Mnemonic
-
-

cpopw rd, rs

-
-
Encoding
-
-
-
-
-Diagram -
-
-
-
-
Description
-
-

This instruction counts the number of 1’s (i.e., set bits) in the least-significant word of the source register.

-
-
Operation
-
-
-
-
-
let bitcount = 0;
-let val = X(rs);
-
-foreach (i from 0 to 31 in inc)
-    if val[i] == 0b1 then bitcount = bitcount + 1 else ();
-
-X[rd] = bitcount
-
-
-
-
-
Included in
-
-
- ----- - - - - - - - - - - - - - - -
ExtensionMinimum versionLifecycle state

Zbb (Basic bit-manipulation)

v1.0

Ratified

-
-
-
-

30.5.18. ctz

-
-
-
Synopsis
-
-

Count trailing zero bits

-
-
Mnemonic
-
-

ctz rd, rs

-
-
Encoding
-
-
-
-
-Diagram -
-
-
-
-
Description
-
-

This instruction counts the number of 0’s before the first 1, starting at the least-significant bit (i.e., 0) and progressing to the most-significant bit (i.e., XLEN-1). -Accordingly, if the input is 0, the output is XLEN, and if the least-significant bit of the input is a 1, the output is 0.

-
-
Operation
-
-
-
-
-
val LowestSetBit : forall ('N : Int), 'N >= 0. bits('N) -> int
-
-function LowestSetBit x = {
-  foreach (i from 0 to (xlen - 1) by 1 in inc)
-    if [x[i]] == 0b1 then return(i) else ();
-  return xlen;
-}
-
-let rs = X(rs);
-X[rd] = LowestSetBit(rs);
-
-
-
-
-
Included in
-
-
- ----- - - - - - - - - - - - - - - -
ExtensionMinimum versionLifecycle state

Zbb (Basic bit-manipulation)

v1.0

Ratified

-
-
-
-

30.5.19. ctzw

-
-
-
Synopsis
-
-

Count trailing zero bits in word

-
-
Mnemonic
-
-

ctzw rd, rs

-
-
Encoding
-
-
-
-
-Diagram -
-
-
-
-
Description
-
-

This instruction counts the number of 0’s before the first 1, starting at the least-significant bit (i.e., 0) and progressing to the most-significant bit of the least-significant word (i.e., 31). Accordingly, if the least-significant word is 0, the output is 32, and if the least-significant bit of the input is a 1, the output is 0.

-
-
Operation
-
-
-
-
-
val LowestSetBit32 : forall ('N : Int), 'N >= 0. bits('N) -> int
-
-function LowestSetBit32 x = {
-  foreach (i from 0 to 31 by 1 in inc)
-    if [x[i]] == 0b1 then return(i) else ();
-  return 32;
-}
-
-let rs = X(rs);
-X[rd] = LowestSetBit32(rs);
-
-
-
-
-
Included in
-
-
- ----- - - - - - - - - - - - - - - -
ExtensionMinimum versionLifecycle state

Zbb (Basic bit-manipulation)

v1.0

Ratified

-
-
-
-

30.5.20. max

-
-
-
Synopsis
-
-

Maximum

-
-
Mnemonic
-
-

max rd, rs1, rs2

-
-
Encoding
-
-
-
-
-Diagram -
-
-
-
-
Description
-
-

This instruction returns the larger of two signed integers.

-
-
Operation
-
-
-
-
-
let rs1_val = X(rs1);
-let rs2_val = X(rs2);
-
-let result = if   rs1_val <_s rs2_val
-             then rs2_val
-             else rs1_val;
-
-X(rd) = result;
-
-
-
- - - - - -
- - -
Software Hint
-
-

Calculating the absolute value of a signed integer can be performed -using the following sequence: neg rD,rS followed by max -rD,rS,rD. When using this common sequence, it is suggested that they -are scheduled with no intervening instructions so that -implementations that are so optimized can fuse them together.

-
-
-
-
-
-
Included in
-
-
- ----- - - - - - - - - - - - - - - -
ExtensionMinimum versionLifecycle state

Zbb (Basic bit-manipulation)

v1.0

Ratified

-
-
-
-

30.5.21. maxu

-
-
-
Synopsis
-
-

Unsigned maximum

-
-
Mnemonic
-
-

maxu rd, rs1, rs2

-
-
Encoding
-
-
-
-
-Diagram -
-
-
-
-
Description
-
-

This instruction returns the larger of two unsigned integers.

-
-
Operation
-
-
-
-
-
let rs1_val = X(rs1);
-let rs2_val = X(rs2);
-
-let result = if   rs1_val <_u rs2_val
-             then rs2_val
-             else rs1_val;
-
-X(rd) = result;
-
-
-
-
-
Included in
-
-
- ----- - - - - - - - - - - - - - - -
ExtensionMinimum versionLifecycle state

Zbb (Basic bit-manipulation)

v1.0

Ratified

-
-
-
-

30.5.22. min

-
-
-
Synopsis
-
-

Minimum

-
-
Mnemonic
-
-

min rd, rs1, rs2

-
-
Encoding
-
-
-
-
-Diagram -
-
-
-
-
Description
-
-

This instruction returns the smaller of two signed integers.

-
-
Operation
-
-
-
-
-
let rs1_val = X(rs1);
-let rs2_val = X(rs2);
-
-let result = if   rs1_val <_s rs2_val
-             then rs1_val
-             else rs2_val;
-
-X(rd) = result;
-
-
-
-
-
Included in
-
-
- ----- - - - - - - - - - - - - - - -
ExtensionMinimum versionLifecycle state

Zbb (Basic bit-manipulation)

v1.0

Ratified

-
-
-
-

30.5.23. minu

-
-
-
Synopsis
-
-

Unsigned minimum

-
-
Mnemonic
-
-

minu rd, rs1, rs2

-
-
Encoding
-
-
-
-
-Diagram -
-
-
-
-
Description
-
-

This instruction returns the smaller of two unsigned integers.

-
-
Operation
-
-
-
-
-
let rs1_val = X(rs1);
-let rs2_val = X(rs2);
-
-let result = if   rs1_val <_u rs2_val
-             then rs1_val
-             else rs2_val;
-
-X(rd) = result;
-
-
-
-
-
Included in
-
-
- ----- - - - - - - - - - - - - - - -
ExtensionMinimum versionLifecycle state

Zbb (Basic bit-manipulation)

v1.0

Ratified

-
-
-
-

30.5.24. orc.b

-
-
-
Synopsis
-
-

Bitwise OR-Combine, byte granule

-
-
Mnemonic
-
-

orc.b rd, rs

-
-
Encoding
-
-
-
-
-Diagram -
-
-
-
-
Description
-
-

Combines the bits within each byte using bitwise logical OR. -This sets the bits of each byte in the result rd to all zeros if no bit within the respective byte of rs is set, or to all ones if any bit within the respective byte of rs is set.

-
-
Operation
-
-
-
-
-
let input = X(rs);
-let output : xlenbits = 0;
-
-foreach (i from 0 to (xlen - 8) by 8) {
-   output[(i + 7)..i] = if   input[(i + 7)..i] == 0
-                        then 0b00000000
-                        else 0b11111111;
-}
-
-X[rd] = output;
-
-
-
-
-
Included in
-
-
- ----- - - - - - - - - - - - - - - -
ExtensionMinimum versionLifecycle state

Zbb (Basic bit-manipulation)

v1.0

Ratified

-
-
-
-

30.5.25. orn

-
-
-
Synopsis
-
-

OR with inverted operand

-
-
Mnemonic
-
-

orn rd, rs1, rs2

-
-
Encoding
-
-
-
-
-Diagram -
-
-
-
-
Description
-
-

This instruction performs the bitwise logical OR operation between rs1 and the bitwise inversion of rs2.

-
-
Operation
-
-
-
-
-
X(rd) = X(rs1) | ~X(rs2);
-
-
-
-
-
Included in
-
-
- ----- - - - - - - - - - - - - - - - - - - - -
ExtensionMinimum versionLifecycle state

Zbb (Basic bit-manipulation)

v1.0

Ratified

Zbkb (Bit-manipulation for Cryptography)

v1.0

Ratified

-
-
-
-

30.5.26. pack

-
-
-
Synopsis
-
-

Pack the low halves of rs1 and rs2 into rd.

-
-
Mnemonic
-
-

pack rd, rs1, rs2

-
-
Encoding
-
-
-
-
-Diagram -
-
-
-
-
Description
-
-

The pack instruction packs the XLEN/2-bit lower halves of rs1 and rs2 into -rd, with rs1 in the lower half and rs2 in the upper half.

-
-
Operation
-
-
-
-
-
let lo_half : bits(xlen/2) = X(rs1)[xlen/2-1..0];
-let hi_half : bits(xlen/2) = X(rs2)[xlen/2-1..0];
-X(rd) = EXTZ(hi_half @ lo_half);
-
-
-
-
-
Included in
-
-
- ----- - - - - - - - - - - - - - - -
ExtensionMinimum versionLifecycle state

Zbkb (Bit-manipulation for Cryptography)

v1.0

Ratified

-
-
-
-

30.5.27. packh

-
-
-
Synopsis
-
-

Pack the low bytes of rs1 and rs2 into rd.

-
-
Mnemonic
-
-

packh rd, rs1, rs2

-
-
Encoding
-
-
-
-
-Diagram -
-
-
-
-
Description
-
-

The packh instruction packs the least-significant bytes of -rs1 and rs2 into the 16 least-significant bits of rd, -zero extending the rest of rd.

-
-
Operation
-
-
-
-
-
let lo_half : bits(8) = X(rs1)[7..0];
-let hi_half : bits(8) = X(rs2)[7..0];
-X(rd) = EXTZ(hi_half @ lo_half);
-
-
-
-
-
Included in
-
-
- ----- - - - - - - - - - - - - - - -
ExtensionMinimum versionLifecycle state

Zbkb (Bit-manipulation for Cryptography)

v1.0

Ratified

-
-
-
-

30.5.28. packw

-
-
-
Synopsis
-
-

Pack the low 16-bits of rs1 and rs2 into rd on RV64.

-
-
Mnemonic
-
-

packw rd, rs1, rs2

-
-
Encoding
-
-
-
-
-Diagram -
-
-
-
-
Description
-
-

This instruction packs the low 16 bits of -rs1 and rs2 into the 32 least-significant bits of rd, -sign extending the 32-bit result to the rest of rd. -This instruction only exists on RV64 based systems.

-
-
Operation
-
-
-
-
-
let lo_half : bits(16) = X(rs1)[15..0];
-let hi_half : bits(16) = X(rs2)[15..0];
-X(rd) = EXTS(hi_half @ lo_half);
-
-
-
-
-
Included in
-
-
- ----- - - - - - - - - - - - - - - -
ExtensionMinimum versionLifecycle state

Zbkb (Bit-manipulation for Cryptography)

v1.0

Ratified

-
-
-
-

30.5.29. rev8

-
-
-
Synopsis
-
-

Byte-reverse register

-
-
Mnemonic
-
-

rev8 rd, rs

-
-
Encoding (RV32)
-
-
-
-
-Diagram -
-
-
-
-
Encoding (RV64)
-
-
-
-
-Diagram -
-
-
-
-
Description
-
-

This instruction reverses the order of the bytes in rs.

-
-
Operation
-
-
-
-
-
let input = X(rs);
-let output : xlenbits = 0;
-let j = xlen - 1;
-
-foreach (i from 0 to (xlen - 8) by 8) {
-   output[i..(i + 7)] = input[(j - 7)..j];
-   j = j - 8;
-}
-
-X[rd] = output
-
-
-
- - - - - -
- - -
Note
-
-

The rev8 mnemonic corresponds to different instruction encodings in RV32 and RV64.

-
-
-
-
- - - - - -
- - -
Software Hint
-
-

The byte-reverse operation is only available for the full register -width. To emulate word-sized and halfword-sized byte-reversal, -perform a rev8 rd,rs followed by a srai rd,rd,K, where K is -XLEN-32 and XLEN-16, respectively.

-
-
-
-
-
-
Included in
-
-
- ----- - - - - - - - - - - - - - - - - - - - -
ExtensionMinimum versionLifecycle state

Zbb (Basic bit-manipulation)

v1.0

Ratified

Zbkb (Bit-manipulation for Cryptography)

v1.0

Ratified

-
-
-
-

30.5.30. rev.b

-
-
-
Synopsis
-
-

Reverse the bits in each byte of a source register.

-
-
Mnemonic
-
-

rev.b rd, rs

-
-
Encoding
-
-
-
-
-Diagram -
-
-
-
-
Description
-
-

This instruction reverses the order of the bits in every byte of a register.

-
-
Operation
-
-
-
-
-
result : xlenbits = EXTZ(0b0);
-foreach (i from 0 to sizeof(xlen) by 8) {
-    result[i+7..i] = reverse_bits_in_byte(X(rs1)[i+7..i]);
-};
-X(rd) = result;
-
-
-
-
-
Included in
-
-
- ----- - - - - - - - - - - - - - - -
ExtensionMinimum versionLifecycle state

Zbkb (Bit-manipulation for Cryptography)

v1.0

Ratified

-
-
-
-

30.5.31. rol

-
-
-
Synopsis
-
-

Rotate Left (Register)

-
-
Mnemonic
-
-

rol rd, rs1, rs2

-
-
Encoding
-
-
-
-
-Diagram -
-
-
-
-
Description
-
-

This instruction performs a rotate left of rs1 by the amount in least-significant log2(XLEN) bits of rs2.

-
-
Operation
-
-
-
-
-
let shamt = if   xlen == 32
-            then X(rs2)[4..0]
-            else X(rs2)[5..0];
-let result = (X(rs1) << shamt) | (X(rs1) >> (xlen - shamt));
-
-X(rd) = result;
-
-
-
-
-
Included in
-
-
- ----- - - - - - - - - - - - - - - - - - - - -
ExtensionMinimum versionLifecycle state

Zbb (Basic bit-manipulation)

0.93

Ratified

Zbkb (Bit-manipulation for Cryptography)

v1.0

Ratified

-
-
-
-

30.5.32. rolw

-
-
-
Synopsis
-
-

Rotate Left Word (Register)

-
-
Mnemonic
-
-

rolw rd, rs1, rs2

-
-
Encoding
-
-
-
-
-Diagram -
-
-
-
-
Description
-
-

This instruction performs a rotate left on the least-significant word of rs1 by the amount in least-significant 5 bits of rs2. -The resulting word value is sign-extended by copying bit 31 to all of the more-significant bits.

-
-
Operation
-
-
-
-
-
let rs1 = EXTZ(X(rs1)[31..0]);
-let shamt = X(rs2)[4..0];
-let result = (rs1 << shamt) | (rs1 >> (32 - shamt));
-X(rd) = EXTS(result[31..0]);
-
-
-
-
-
Included in
-
-
- ----- - - - - - - - - - - - - - - - - - - - -
ExtensionMinimum versionLifecycle state

Zbb (Basic bit-manipulation)

0.93

Ratified

Zbkb (Bit-manipulation for Cryptography)

v1.0

Ratified

-
-
-
-

30.5.33. ror

-
-
-
Synopsis
-
-

Rotate Right (Register)

-
-
Mnemonic
-
-

ror rd, rs1, rs2

-
-
Encoding
-
-
-
-
-Diagram -
-
-
-
-
Description
-
-

This instruction performs a rotate right of rs1 by the amount in least-significant log2(XLEN) bits of rs2.

-
-
Operation
-
-
-
-
-
let shamt = if   xlen == 32
-            then X(rs2)[4..0]
-            else X(rs2)[5..0];
-let result = (X(rs1) >> shamt) | (X(rs1) << (xlen - shamt));
-
-X(rd) = result;
-
-
-
-
-
Included in
-
-
- ----- - - - - - - - - - - - - - - - - - - - -
ExtensionMinimum versionLifecycle state

Zbb (Basic bit-manipulation)

0.93

Ratified

Zbkb (Bit-manipulation for Cryptography)

v1.0

Ratified

-
-
-
-

30.5.34. rori

-
-
-
Synopsis
-
-

Rotate Right (Immediate)

-
-
Mnemonic
-
-

rori rd, rs1, shamt

-
-
Encoding (RV32)
-
-
-
-
-Diagram -
-
-
-
-
Encoding (RV64)
-
-
-
-
-Diagram -
-
-
-
-
Description
-
-

This instruction performs a rotate right of rs1 by the amount in the least-significant log2(XLEN) bits of shamt. -For RV32, the encodings corresponding to shamt[5]=1 are reserved.

-
-
Operation
-
-
-
-
-
let shamt = if   xlen == 32
-            then shamt[4..0]
-            else shamt[5..0];
-let result = (X(rs1) >> shamt) | (X(rs1) << (xlen - shamt));
-
-X(rd) = result;
-
-
-
-
-
Included in
-
-
- ----- - - - - - - - - - - - - - - - - - - - -
ExtensionMinimum versionLifecycle state

Zbb (Basic bit-manipulation)

0.93

Ratified

Zbkb (Bit-manipulation for Cryptography)

v1.0

Ratified

-
-
-
-

30.5.35. roriw

-
-
-
Synopsis
-
-

Rotate Right Word (Immediate)

-
-
Mnemonic
-
-

roriw rd, rs1, shamt

-
-
Encoding
-
-
-
-
-Diagram -
-
-
-
-
Description
-
-

This instruction performs a rotate right on the least-significant word -of rs1 by the amount in the least-significant log2(XLEN) bits of -shamt. -The resulting word value is sign-extended by copying bit 31 to all of -the more-significant bits.

-
-
Operation
-
-
-
-
-
let rs1_data = EXTZ(X(rs1)[31..0]);
-let result = (rs1_data >> shamt) | (rs1_data << (32 - shamt));
-X(rd) = EXTS(result[31..0]);
-
-
-
-
-
Included in
-
-
- ----- - - - - - - - - - - - - - - - - - - - -
ExtensionMinimum versionLifecycle state

Zbb (Basic bit-manipulation)

0.93

Ratified

Zbkb (Bit-manipulation for Cryptography)

v1.0

Ratified

-
-
-
-

30.5.36. rorw

-
-
-
Synopsis
-
-

Rotate Right Word (Register)

-
-
Mnemonic
-
-

rorw rd, rs1, rs2

-
-
Encoding
-
-
-
-
-Diagram -
-
-
-
-
Description
-
-

This instruction performs a rotate right on the least-significant word of rs1 by the amount in the least-significant 5 bits of rs2. -The resultant word is sign-extended by copying bit 31 to all of the more-significant bits.

-
-
Operation
-
-
-
-
-
let rs1 = EXTZ(X(rs1)[31..0]);
-let shamt = X(rs2)[4..0];
-let result = (rs1 >> shamt) | (rs1 << (32 - shamt));
-X(rd) = EXTS(result);
-
-
-
-
-
Included in
-
-
- ----- - - - - - - - - - - - - - - - - - - - -
ExtensionMinimum versionLifecycle state

Zbb (Basic bit-manipulation)

0.93

Ratified

Zbkb (Bit-manipulation for Cryptography)

v1.0

Ratified

-
-
-
-

30.5.37. sext.b

-
-
-
Synopsis
-
-

Sign-extend byte

-
-
Mnemonic
-
-

sext.b rd, rs

-
-
Encoding
-
-
-
-
-Diagram -
-
-
-
-
Description
-
-

This instruction sign-extends the least-significant byte in the source to XLEN by copying the most-significant bit in the byte (i.e., bit 7) to all of the more-significant bits.

-
-
Operation
-
-
-
-
-
X(rd) = EXTS(X(rs)[7..0]);
-
-
-
-
-
Included in
-
-
- ----- - - - - - - - - - - - - - - -
ExtensionMinimum versionLifecycle state

Zbb (Basic bit-manipulation)

0.93

Ratified

-
-
-
-

30.5.38. sext.h

-
-
-
Synopsis
-
-

Sign-extend halfword

-
-
Mnemonic
-
-

sext.h rd, rs

-
-
Encoding
-
-
-
-
-Diagram -
-
-
-
-
Description
-
-

This instruction sign-extends the least-significant halfword in rs to XLEN by copying the most-significant bit in the halfword (i.e., bit 15) to all of the more-significant bits.

-
-
Operation
-
-
-
-
-
X(rd) = EXTS(X(rs)[15..0]);
-
-
-
-
-
Included in
-
-
- ----- - - - - - - - - - - - - - - -
ExtensionMinimum versionLifecycle state

Zbb (Basic bit-manipulation)

0.93

Ratified

-
-
-
-

30.5.39. sh1add

-
-
-
Synopsis
-
-

Shift left by 1 and add

-
-
Mnemonic
-
-

sh1add rd, rs1, rs2

-
-
Encoding
-
-
-
-
-Diagram -
-
-
-
-
Description
-
-

This instruction shifts rs1 to the left by 1 bit and adds it to rs2.

-
-
Operation
-
-
-
-
-
X(rd) = X(rs2) + (X(rs1) << 1);
-
-
-
-
-
Included in
-
-
- ----- - - - - - - - - - - - - - - -
ExtensionMinimum versionLifecycle state

Zba (Address generation instructions)

0.93

Ratified

-
-
-
-

30.5.40. sh1add.uw

-
-
-
Synopsis
-
-

Shift unsigned word left by 1 and add

-
-
Mnemonic
-
-

sh1add.uw rd, rs1, rs2

-
-
Encoding
-
-
-
-
-Diagram -
-
-
-
-
Description
-
-

This instruction performs an XLEN-wide addition of two addends. -The first addend is rs2. The second addend is the unsigned value formed by extracting the least-significant word of rs1 and shifting it left by 1 place.

-
-
Operation
-
-
-
-
-
let base = X(rs2);
-let index = EXTZ(X(rs1)[31..0]);
-
-X(rd) = base + (index << 1);
-
-
-
-
-
Included in
-
-
- ----- - - - - - - - - - - - - - - -
ExtensionMinimum versionLifecycle state

Zba (Address generation instructions)

0.93

Ratified

-
-
-
-

30.5.41. sh2add

-
-
-
Synopsis
-
-

Shift left by 2 and add

-
-
Mnemonic
-
-

sh2add rd, rs1, rs2

-
-
Encoding
-
-
-
-
-Diagram -
-
-
-
-
Description
-
-

This instruction shifts rs1 to the left by 2 places and adds it to rs2.

-
-
Operation
-
-
-
-
-
X(rd) = X(rs2) + (X(rs1) << 2);
-
-
-
-
-
Included in
-
-
- ----- - - - - - - - - - - - - - - -
ExtensionMinimum versionLifecycle state

Zba (Address generation instructions)

0.93

Ratified

-
-
-
-

30.5.42. sh2add.uw

-
-
-
Synopsis
-
-

Shift unsigned word left by 2 and add

-
-
Mnemonic
-
-

sh2add.uw rd, rs1, rs2

-
-
Encoding
-
-
-
-
-Diagram -
-
-
-
-
Description
-
-

This instruction performs an XLEN-wide addition of two addends. -The first addend is rs2. -The second addend is the unsigned value formed by extracting the least-significant word of rs1 and shifting it left by 2 places.

-
-
Operation
-
-
-
-
-
let base = X(rs2);
-let index = EXTZ(X(rs1)[31..0]);
-
-X(rd) = base + (index << 2);
-
-
-
-
-
Included in
-
-
- ----- - - - - - - - - - - - - - - -
ExtensionMinimum versionLifecycle state

Zba (Address generation instructions)

0.93

Ratified

-
-
-
-

30.5.43. sh3add

-
-
-
Synopsis
-
-

Shift left by 3 and add

-
-
Mnemonic
-
-

sh3add rd, rs1, rs2

-
-
Encoding
-
-
-
-
-Diagram -
-
-
-
-
Description
-
-

This instruction shifts rs1 to the left by 3 places and adds it to rs2.

-
-
Operation
-
-
-
-
-
X(rd) = X(rs2) + (X(rs1) << 3);
-
-
-
-
-
Included in
-
-
- ----- - - - - - - - - - - - - - - -
ExtensionMinimum versionLifecycle state

Zba (Address generation instructions)

0.93

Ratified

-
-
-
-

30.5.44. sh3add.uw

-
-
-
Synopsis
-
-

Shift unsigned word left by 3 and add

-
-
Mnemonic
-
-

sh3add.uw rd, rs1, rs2

-
-
Encoding
-
-
-
-
-Diagram -
-
-
-
-
Description
-
-

This instruction performs an XLEN-wide addition of two addends. The first addend is rs2. The second addend is the unsigned value formed by extracting the least-significant word of rs1 and shifting it left by 3 places.

-
-
Operation
-
-
-
-
-
let base = X(rs2);
-let index = EXTZ(X(rs1)[31..0]);
-
-X(rd) = base + (index << 3);
-
-
-
-
-
Included in
-
-
- ----- - - - - - - - - - - - - - - -
ExtensionMinimum versionLifecycle state

Zba (Address generation instructions)

0.93

Ratified

-
-
-
-

30.5.45. slli.uw

-
-
-
Synopsis
-
-

Shift-left unsigned word (Immediate)

-
-
Mnemonic
-
-

slli.uw rd, rs1, shamt

-
-
Encoding
-
-
-
-
-Diagram -
-
-
-
-
Description
-
-

This instruction takes the least-significant word of rs1, zero-extends it, and shifts it left by the immediate.

-
-
Operation
-
-
-
-
-
X(rd) = (EXTZ(X(rs1)[31..0]) << shamt);
-
-
-
-
-
Included in
-
-
- ----- - - - - - - - - - - - - - - -
ExtensionMinimum versionLifecycle state

Zba (Address generation instructions)

0.93

Ratified

-
- - - - - -
- - -
Architecture Explanation
-
-

This instruction is the same as slli with zext.w performed on rs1 before shifting.

-
-
-
-
-
-
-

30.5.46. unzip

-
-
-
Synopsis
-
-

Implements the inverse of the zip instruction.

-
-
Mnemonic
-
-

unzip rd, rs

-
-
Encoding
-
-
-
-
-Diagram -
-
-
-
-
Description
-
-

This instruction places the even bits of the source -word into the low half of the destination word, and the odd bits into the high half. -It is the inverse of the zip instruction. -This instruction is available only on RV32.

-
-
Operation
-
-
-
-
-
foreach (i from 0 to xlen/2-1) {
-  X(rd)[i] = X(rs1)[2*i]
-  X(rd)[i+xlen/2] = X(rs1)[2*i+1]
-}
-
-
-
- - - - - -
- - -
Software Hint
-
-

This instruction is useful for implementing the SHA3 cryptographic -hash function on a 32-bit architecture, as it implements the -bit-interleaving operation used to speed up the 64-bit rotations -directly.

-
-
-
-
-
-
Included in
-
-
- ----- - - - - - - - - - - - - - - -
ExtensionMinimum versionLifecycle state

Zbkb (Bit-manipulation for Cryptography) (RV32)

v1.0

Ratified

-
-
-
-

30.5.47. xnor

-
-
-
Synopsis
-
-

Exclusive NOR

-
-
Mnemonic
-
-

xnor rd, rs1, rs2

-
-
Encoding
-
-
-
-
-Diagram -
-
-
-
-
Description
-
-

This instruction performs the bit-wise exclusive-NOR operation on rs1 and rs2.

-
-
Operation
-
-
-
-
-
X(rd) = ~(X(rs1) ^ X(rs2));
-
-
-
-
-
Included in
-
-
- ----- - - - - - - - - - - - - - - - - - - - -
ExtensionMinimum versionLifecycle state

Zbb (Basic bit-manipulation)

0.93

Ratified

Zbkb (Bit-manipulation for Cryptography)

v1.0

Ratified

-
-
-
-

30.5.48. xperm.b

-
-
-
Synopsis
-
-

Byte-wise lookup of indices into a vector in registers.

-
-
Mnemonic
-
-

xperm.b rd, rs1, rs2

-
-
Encoding
-
-
-
-
-Diagram -
-
-
-
-
Description
-
-

The xperm.b instruction operates on bytes. -The rs1 register contains a vector of XLEN/8 8-bit elements. -The rs2 register contains a vector of XLEN/8 8-bit indexes. -The result is each element in rs2 replaced by the indexed element in rs1, -or zero if the index into rs1 is out of bounds.

-
-
Operation
-
-
-
-
-
val xpermb_lookup : (bits(8), xlenbits) -> bits(8)
-function xpermb_lookup (idx, lut) = {
-    (lut >> (idx @ 0b000))[7..0]
-}
-
-function clause execute ( XPERM_B (rs2,rs1,rd)) = {
-    result : xlenbits = EXTZ(0b0);
-    foreach(i from 0 to xlen by 8) {
-        result[i+7..i] = xpermb_lookup(X(rs2)[i+7..i], X(rs1));
-    };
-    X(rd) = result;
-    RETIRE_SUCCESS
-}
-
-
-
-
-
Included in
-
-
- ----- - - - - - - - - - - - - - - -
ExtensionMinimum versionLifecycle state

Zbkx (Crossbar permutations)

v1.0

Ratified

-
-
-
-

30.5.49. xperm.n

-
-
-
Synopsis
-
-

Nibble-wise lookup of indices into a vector.

-
-
Mnemonic
-
-

xperm.n rd, rs1, rs2

-
-
Encoding
-
-
-
-
-Diagram -
-
-
-
-
Description
-
-

The xperm.n instruction operates on nibbles. -The rs1 register contains a vector of XLEN/4 4-bit elements. -The rs2 register contains a vector of XLEN/4 4-bit indexes. -The result is each element in rs2 replaced by the indexed element in rs1, -or zero if the index into rs1 is out of bounds.

-
-
Operation
-
-
-
-
-
val xpermn_lookup : (bits(4), xlenbits) -> bits(4)
-function xpermn_lookup (idx, lut) = {
-    (lut >> (idx @ 0b00))[3..0]
-}
-
-function clause execute ( XPERM_N (rs2,rs1,rd)) = {
-    result : xlenbits = EXTZ(0b0);
-    foreach(i from 0 to xlen by 4) {
-        result[i+3..i] = xpermn_lookup(X(rs2)[i+3..i], X(rs1));
-    };
-    X(rd) = result;
-    RETIRE_SUCCESS
-}
-
-
-
-
-
Included in
-
-
- ----- - - - - - - - - - - - - - - -
ExtensionMinimum versionLifecycle state

Zbkx (Crossbar permutations)

v1.0

Ratified

-
-
-
-

30.5.50. zext.h

-
-
-
Synopsis
-
-

Zero-extend halfword

-
-
Mnemonic
-
-

zext.h rd, rs

-
-
Encoding (RV32)
-
-
-
-
-Diagram -
-
-
-
-
Encoding (RV64)
-
-
-
-
-Diagram -
-
-
-
-
Description
-
-

This instruction zero-extends the least-significant halfword of the source to XLEN by inserting 0’s into all of the bits more significant than 15.

-
-
Operation
-
-
-
-
-
X(rd) = EXTZ(X(rs)[15..0]);
-
-
-
- - - - - -
- - -
Note
-
-

The zext.h mnemonic corresponds to different instruction encodings in RV32 and RV64.

-
-
-
-
-
-
Included in
-
-
- ----- - - - - - - - - - - - - - - -
ExtensionMinimum versionLifecycle state

Zbb (Basic bit-manipulation)

0.93

Ratified

-
-
-
-

30.5.51. zip

-
-
-
Synopsis
-
-

Interleave the upper and lower halves of the source word into the odd/even bits of the -destination.

-
-
Mnemonic
-
-

zip rd, rs

-
-
Encoding
-
-
-
-
-Diagram -
-
-
-
-
Description
-
-

This instruction places the bits of the low half of the source word into -the even bit positions of the destination word, and the bits of the high half into the odd bit positions. -It is the inverse of the unzip instruction. -This instruction is available only on RV32.

-
-
Operation
-
-
-
-
-
foreach (i from 0 to xlen/2-1) {
-  X(rd)[2*i] = X(rs1)[i]
-  X(rd)[2*i+1] = X(rs1)[i+xlen/2]
-}
-
-
-
- - - - - -
- - -
Software Hint
-
-

This instruction is useful for implementing the SHA3 cryptographic -hash function on a 32-bit architecture, as it implements the -bit-interleaving operation used to speed up the 64-bit rotations -directly.

-
-
-
-
-
-
Included in
-
-
- ----- - - - - - - - - - - - - - - -
ExtensionMinimum versionLifecycle state

Zbkb (Bit-manipulation for Cryptography) (RV32)

v1.0

Ratified

-
-
-
-

30.6. Software optimization guide

-
-

30.6.1. strlen

-
-

The orc.b instruction allows for the efficient detection of NUL bytes in an XLEN-sized chunk of data:

-
-
-
    -
  • -

    the result of orc.b on a chunk that does not contain any NUL bytes will be all-ones, and

    -
  • -
  • -

    after a bitwise-negation of the result of orc.b, the number of data bytes before the first NUL byte (if any) can be detected by ctz/clz (depending on the endianness of data).

    -
  • -
-
-
-

A full example of a strlen function, which uses these techniques and also demonstrates the use of it for unaligned/partial data, is the following:

-
-
-
-
#include <sys/asm.h>
-
-	.text
-	.globl strlen
-	.type  strlen, @function
-strlen:
-	andi	a3, a0, (SZREG-1)   // offset
-	andi    a1, a0, -SZREG      // align pointer
-.Lprologue:
-	li      a4, SZREG
-	sub     a4, a4, a3          // SZREG - offset (valid bytes in first chunk)
-	slli	a3, a3, 3           // offset * 8
-	REG_L   a2, 0(a1)           // chunk
-	/*
-	 * Shift the partial/unaligned chunk we loaded to remove the bytes
-	 * from before the start of the string, adding NUL bytes at the end.
-	 */
-#if __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__
-	srl	a2, a2 ,a3          // chunk >> (offset * 8)
-#else
-	sll     a2, a2, a3
-#endif
-	orc.b   a2, a2
-	not	a2, a2
-	/*
-	 * Non-NUL bytes in the string have been expanded to 0x00, while
- 	 * NUL bytes have become 0xff.  Search for the first set bit
-	 * (corresponding to a NUL byte in the original chunk).
-	 */
-#if __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__
-	ctz     a2, a2
-#else
-	clz     a2, a2
-#endif
-	/*
-	 * The first chunk is special: compare against the number of valid
-	 * bytes in this chunk.
-	 */
-	srli    a0, a2, 3
-	bgtu    a4, a0, .Ldone
-	addi    a3, a1, SZREG
-	li      a4, -1
-	.align 2
-	/*
-	 * Our critical loop is 4 instructions and processes data in 4 byte
-	 * or 8 byte chunks.
-	 */
-.Lloop:
-	REG_L   a2, SZREG(a1)
-	addi    a1, a1, SZREG
-	orc.b   a2, a2
-	beq     a2, a4, .Lloop
-
-.Lepilogue:
-	not     a2, a2
-#if __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__
-	ctz     a2, a2
-#else
-	clz     a2, a2
-#endif
-	sub     a1, a1, a3
-	add	a0, a0, a1
-	srli    a2, a2, 3
-	add 	a0, a0, a2
-.Ldone:
-	ret
-
-
-
-
-

30.6.2. strcmp

-
-
-
#include <sys/asm.h>
-
-  .text
-  .globl strcmp
-  .type  strcmp, @function
-strcmp:
-  or    a4, a0, a1
-  li    t2, -1
-  and   a4, a4, SZREG-1
-  bnez  a4, .Lsimpleloop
-
-  # Main loop for aligned strings
-.Lloop:
-  REG_L a2, 0(a0)
-  REG_L a3, 0(a1)
-  orc.b t0, a2
-  bne   t0, t2, .Lfoundnull
-  addi  a0, a0, SZREG
-  addi  a1, a1, SZREG
-  beq   a2, a3, .Lloop
-
-  # Words don't match, and no null byte in first word.
-  # Get bytes in big-endian order and compare.
-#if __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__
-  rev8  a2, a2
-  rev8  a3, a3
-#endif
-  # Synthesize (a2 >= a3) ? 1 : -1 in a branchless sequence.
-  sltu a0, a2, a3
-  neg  a0, a0
-  ori  a0, a0, 1
-  ret
-
-.Lfoundnull:
-  # Found a null byte.
-  # If words don't match, fall back to simple loop.
-  bne   a2, a3, .Lsimpleloop
-
-  # Otherwise, strings are equal.
-  li    a0, 0
-  ret
-
-  # Simple loop for misaligned strings
-.Lsimpleloop:
-  lbu   a2, 0(a0)
-  lbu   a3, 0(a1)
-  addi  a0, a0, 1
-  addi  a1, a1, 1
-  bne   a2, a3, 1f
-  bnez  a2, .Lsimpleloop
-
-1:
-  sub   a0, a2, a3
-  ret
-
-.size   strcmp, .-strcmp
-
-
-
-
-
-
-
-

31. "J" Extension for Dynamically Translated Languages, Version 0.0

-
-
-

This chapter is a placeholder for a future standard extension to support -dynamically translated languages.

-
-
- - - - - -
- - -
-

Many popular languages are usually implemented via dynamic translation, -including Java and Javascript. These languages can benefit from -additional ISA support for dynamic checks and garbage collection.

-
-
-
-
-
-
-

32. "P" Extension for Packed-SIMD Instructions, Version 0.2

-
-
- - - - - -
- - -
-

Discussions at the 5th RISC-V workshop indicated a desire to drop this -packed-SIMD proposal for floating-point registers in favor of -standardizing on the V extension for large floating-point SIMD -operations. However, there was interest in packed-SIMD fixed-point -operations for use in the integer registers of small RISC-V -implementations. A task group is working to define the new P extension.

-
-
-
-
-
-
-

33. "V" Standard Extension for Vector Operations, Version 1.0

-
-
-

CV32A65X: This extension is not supported.

-
-
-
-
-

34. Cryptography Extensions: Scalar & Entropy Source Instructions, Version 1.0.1

-
-
-

CV32A65X: These extensions are not supported.

-
-
-
-
-

35. Cryptography Extensions: Vector Instructions, Version 1.0

-
-
-

CV32A65X: These extensions are not supported.

-
-
-
-
-

36. Control-flow Integrity (CFI)

-
-
-

CV32A65X: The Zicfiss extension is not supported.

-
-
-

CV32A65X: The Zicfilp extension is not supported.

-
-
-
-
-

37. RV32/64G Instruction Set Listings

-
-
-

One goal of the RISC-V project is that it be used as a stable software -development target. For this purpose, we define a combination of a base -ISA (RV32I or RV64I) plus selected standard extensions (IMAFD, Zicsr, -Zifencei) as a "general-purpose" ISA, and we use the abbreviation G -for the IMAFDZicsr_Zifencei combination of instruction-set extensions.

-
-
-

CV32A65X: This chapter presents opcode maps and instruction-set -listings for CV32A65X.

-
- - ----------- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
Table 26. RISC-V base opcode map, inst[1:0]=11

inst[4:2]

000

001

010

011

100

101

110

111 (>32b)

inst[6:5]

00

LOAD

LOAD-FP

custom-0

MISC-MEM

OP-IMM

AUIPC

OP-IMM-32

48b

01

STORE

STORE-FP

custom-1

AMO

OP

LUI

OP-32

64b

10

MADD

MSUB

NMSUB

NMADD

OP-FP

OP-V

custom-2/rv128

48b

11

BRANCH

JALR

reserved

JAL

SYSTEM

OP-VE

custom-3/rv128

≥80b

-
-

Table 26 shows a map of the major opcodes for -RVG. Major opcodes with 3 or more lower bits set are reserved for -instruction lengths greater than 32 bits. Opcodes marked as reserved -should be avoided for custom instruction-set extensions as they might be -used by future standard extensions. Major opcodes marked as custom-0 -and custom-1 will be avoided by future standard extensions and are -recommended for use by custom instruction-set extensions within the base -32-bit instruction format. The opcodes marked custom-2/rv128 and -custom-3/rv128 are reserved for future use by RV128, but will -otherwise be avoided for standard extensions and so can also be used for -custom instruction-set extensions in RV32 and RV64.

-
-
-

We believe RV32G and RV64G provide simple but complete instruction sets -for a broad range of general-purpose computing. The optional compressed -instruction set described in Chapter 28 can -be added (forming RV32GC and RV64GC) to improve performance, code size, -and energy efficiency, though with some additional hardware complexity.

-
-
-

As we move beyond IMAFDC into further instruction-set extensions, the -added instructions tend to be more domain-specific and only provide -benefits to a restricted class of applications, e.g., for multimedia or -security. Unlike most commercial ISAs, the RISC-V ISA design clearly -separates the base ISA and broadly applicable standard extensions from -these more specialized additions. Chapter 38 -has a more extensive discussion of ways to add extensions to the RISC-V -ISA.

-
-
- ----------------- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -

31

27

26

25

24

20

19

15

14

12

11

7

6

0

funct7

rs2

rs1

funct3

rd

opcode

R-type

imm[11:0]

rs1

funct3

rd

opcode

I-type

imm[11:5]

rs2

rs1

funct3

imm[4:0]

opcode

S-type

imm[12|10:5]

rs2

rs1

funct3

imm[4:1|11]

opcode

B-type

imm[31:12]

rd

opcode

U-type

imm[20|10:1|11|19:12]

rd

opcode

J-type

- ----------------- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -

RV32I Base Instruction Set

imm[31:12]

rd

0110111

LUI

imm[31:12]

rd

0010111

AUIPC

imm[20|10:1|11|19:12]

rd

1101111

JAL

imm[11:0]

rs1

000

rd

1100111

JALR

imm[12|10:5]

rs2

rs1

000

imm[4:1|11]

1100011

BEQ

imm[12|10:5]

rs2

rs1

001

imm[4:1|11]

1100011

BNE

imm[12|10:5]

rs2

rs1

100

imm[4:1|11]

1100011

BLT

imm[12|10:5]

rs2

rs1

101

imm[4:1|11]

1100011

BGE

imm[12|10:5]

rs2

rs1

110

imm[4:1|11]

1100011

BLTU

imm[12|10:5]

rs2

rs1

111

imm[4:1|11]

1100011

BGEU

imm[11:0]

rs1

000

rd

0000011

LB

imm[11:0]

rs1

001

rd

0000011

LH

imm[11:0]

rs1

010

rd

0000011

LW

imm[11:0]

rs1

100

rd

0000011

LBU

imm[11:0]

rs1

101

rd

0000011

LHU

imm[11:5]

rs2

rs1

000

imm[4:0]

0100011

SB

imm[11:5]

rs2

rs1

001

imm[4:0]

0100011

SH

imm[11:5]

rs2

rs1

010

imm[4:0]

0100011

SW

imm[11:0]

rs1

000

rd

0010011

ADDI

imm[11:0]

rs1

010

rd

0010011

SLTI

imm[11:0]

rs1

011

rd

0010011

SLTIU

imm[11:0]

rs1

100

rd

0010011

XORI

imm[11:0]

rs1

110

rd

0010011

ORI

imm[11:0]

rs1

111

rd

0010011

ANDI

0000000

shamt

rs1

001

rd

0010011

SLLI

0000000

shamt

rs1

101

rd

0010011

SRLI

0100000

shamt

rs1

101

rd

0010011

SRAI

0000000

rs2

rs1

000

rd

0110011

ADD

0100000

rs2

rs1

000

rd

0110011

SUB

0000000

rs2

rs1

001

rd

0110011

SLL

0000000

rs2

rs1

010

rd

0110011

SLT

0000000

rs2

rs1

011

rd

0110011

SLTU

0000000

rs2

rs1

100

rd

0110011

XOR

0000000

rs2

rs1

101

rd

0110011

SRL

0100000

rs2

rs1

101

rd

0110011

SRA

0000000

rs2

rs1

110

rd

0110011

OR

0000000

rs2

rs1

111

rd

0110011

AND

fm

pred

succ

rs1

000

rd

0001111

FENCE

1000

0011

0011

00000

000

00000

0001111

FENCE.TSO

0000

0001

0000

00000

000

00000

0001111

PAUSE

000000000000

00000

000

00000

1110011

ECALL

000000000001

00000

000

00000

1110011

EBREAK

-
- ----------------- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -

31

27

26

25

24

20

19

15

14

12

11

7

6

0

funct7

rs2

rs1

funct3

rd

opcode

R-type

imm[11:0]

rs1

funct3

rd

opcode

I-type

imm[11:5]

rs2

rs1

funct3

imm[4:0]

opcode

S-type

- ----------------- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -

RV32/RV64 Zicsr Standard Extension

csr

rs1

001

rd

1110011

CSRRW

csr

rs1

010

rd

1110011

CSRRS

csr

rs1

011

rd

1110011

CSRRC

csr

uimm

101

rd

1110011

CSRRWI

csr

uimm

110

rd

1110011

CSRRSI

csr

uimm

111

rd

1110011

CSRRCI

- ----------------- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -

RV32M Standard Extension

0000001

rs2

rs1

000

rd

0110011

MUL

0000001

rs2

rs1

001

rd

0110011

MULH

0000001

rs2

rs1

010

rd

0110011

MULHSU

0000001

rs2

rs1

011

rd

0110011

MULHU

0000001

rs2

rs1

100

rd

0110011

DIV

0000001

rs2

rs1

101

rd

0110011

DIVU

0000001

rs2

rs1

110

rd

0110011

REM

0000001

rs2

rs1

111

rd

0110011

REMU

-
-
-

Table 27 lists the CSRs that have currently been -allocated CSR addresses. The timers, counters, and floating-point CSRs -are the only CSRs defined in this specification.

-
- - ------ - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
Table 27. RISC-V control and status register (CSR) address map.
NumberPrivilegeNameDescription

Floating-Point Control and Status Registers

0x001

Read write

fflags

Floating-Point Accrued Exceptions.

0x002

Read write

frm

Floating-Point Dynamic Rounding Mode.

0x003

Read write

fcsr

Floating-Point Control and Status Register (frm + fflags).

Counters and Timers

0xC00

Read-only

cycle

Cycle counter for RDCYCLE instruction.

0xC01

Read-only

time

Timer for RDTIME instruction.

0xC02

Read-only

instret

Instructions-retired counter for RDINSTRET instruction.

0xC80

Read-only

cycleh

Upper 32 bits of cycle, RV32I only.

0xC81

Read-only

timeh

Upper 32 bits of time, RV32I only.

0xC82

Read-only

instreth

Upper 32 bits of instret, RV32I only.

-
-
-
-

38. Extending RISC-V

-
-
-

In addition to supporting standard general-purpose software development, -another goal of RISC-V is to provide a basis for more specialized -instruction-set extensions or more customized accelerators. The -instruction encoding spaces and optional variable-length instruction -encoding are designed to make it easier to leverage software development -effort for the standard ISA toolchain when building more customized -processors. For example, the intent is to continue to provide full -software support for implementations that only use the standard I base, -perhaps together with many non-standard instruction-set extensions.

-
-
-

This chapter describes various ways in which the base RISC-V ISA can be -extended, together with the scheme for managing instruction-set -extensions developed by independent groups. This volume only deals with -the unprivileged ISA, although the same approach and terminology is used -for supervisor-level extensions described in the second volume.

-
-
-

38.1. Extension Terminology

-
-

This section defines some standard terminology for describing RISC-V -extensions.

-
-
-

38.1.1. Standard versus Non-Standard Extension

-
-

Any RISC-V processor implementation must support a base integer ISA -(RV32I, RV32E, RV64I, RV64E, or RV128I). In addition, an implementation may -support one or more extensions. We divide extensions into two broad -categories: standard versus non-standard.

-
-
-
    -
  • -

    A standard extension is one that is generally useful and that is -designed to not conflict with any other standard extension. Currently, -"MAFDQCBTPV", described in other chapters of this manual, are either -complete or planned standard extensions.

    -
  • -
  • -

    A non-standard extension may be highly specialized and may conflict -with other standard or non-standard extensions. We anticipate a wide -variety of non-standard extensions will be developed over time, with -some eventually being promoted to standard extensions.

    -
  • -
-
-
-
-

38.1.2. Instruction Encoding Spaces and Prefixes

-
-

An instruction encoding space is some number of instruction bits within -which a base ISA or ISA extension is encoded. RISC-V supports varying -instruction lengths, but even within a single instruction length, there -are various sizes of encoding space available. For example, the base -ISAs are defined within a 30-bit encoding space (bits 31-2 of the 32-bit -instruction), while the atomic extension "A" fits within a 25-bit -encoding space (bits 31-7).

-
-
-

We use the term prefix to refer to the bits to the right of an -instruction encoding space (since instruction fetch in RISC-V is -little-endian, the bits to the right are stored at earlier memory -addresses, hence form a prefix in instruction-fetch order). The prefix -for the standard base ISA encoding is the two-bit "11" field held in -bits 1-0 of the 32-bit word, while the prefix for the standard atomic -extension "A" is the seven-bit "0101111" field held in bits 6-0 of -the 32-bit word representing the AMO major opcode. A quirk of the -encoding format is that the 3-bit funct3 field used to encode a minor -opcode is not contiguous with the major opcode bits in the 32-bit -instruction format, but is considered part of the prefix for 22-bit -instruction spaces.

-
-
-

Although an instruction encoding space could be of any size, adopting a -smaller set of common sizes simplifies packing independently developed -extensions into a single global encoding. -Table 28 gives the suggested sizes for RISC-V.

-
- - -------- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
Table 28. Suggested standard RISC-V instruction encoding space sizes.
SizeUsage# Available in standard instruction length

16-bit

32-bit

48-bit

64-bit

14-bit

Quadrant of compressed 16-bit encoding

3

22-bit

Minor opcode in base 32-bit encoding

$2^{8}$

$2^{20}$

$2^{35}$

25-bit

Major opcode in base 32-bit encoding

32

$2^{17}$

$2^{32}$

30-bit

Quadrant of base 32-bit encoding

1

$2^{12}$

$2^{27}$

32-bit

Minor opcode in 48-bit encoding

$2^{10}$

$2^{25}$

37-bit

Major opcode in 48-bit encoding

32

$2^{20}$

40-bit

Quadrant of 48-bit encoding

4

$2^{17}$

45-bit

Sub-minor opcode in 64-bit encoding

$2^{12}$

48-bit

Minor opcode in 64-bit encoding

$2^{7}$

52-bit

Major opcode in 64-bit encoding

32

-
-
-

38.1.3. Greenfield versus Brownfield Extensions

-
-

We use the term greenfield extension to describe an extension that -begins populating a new instruction encoding space, and hence can only -cause encoding conflicts at the prefix level. We use the term -brownfield extension to describe an extension that fits around -existing encodings in a previously defined instruction space. A -brownfield extension is necessarily tied to a particular greenfield -parent encoding, and there may be multiple brownfield extensions to the -same greenfield parent encoding. For example, the base ISAs are -greenfield encodings of a 30-bit instruction space, while the FDQ -floating-point extensions are all brownfield extensions adding to the -parent base ISA 30-bit encoding space.

-
-
-

Note that we consider the standard A extension to have a greenfield -encoding as it defines a new previously empty 25-bit encoding space in -the leftmost bits of the full 32-bit base instruction encoding, even -though its standard prefix locates it within the 30-bit encoding space -of its parent base ISA. Changing only its single 7-bit prefix could move -the A extension to a different 30-bit encoding space while only worrying -about conflicts at the prefix level, not within the encoding space -itself.

-
- - ----- - - - - - - - - - - - - - - - - - - - -
Table 29. Two-dimensional characterization of standard instruction-set extensions.
Adds stateNo new state

Greenfield

RV32I(30), RV64I(30)

A(25)

Brownfield

F(I), D(F), Q(D)

M(I)

-
-

Table 29 shows the bases and standard extensions placed -in a simple two-dimensional taxonomy. One axis is whether the extension -is greenfield or brownfield, while the other axis is whether the -extension adds architectural state. For greenfield extensions, the size -of the instruction encoding space is given in parentheses. For -brownfield extensions, the name of the extension (greenfield or -brownfield) it builds upon is given in parentheses. Additional -user-level architectural state usually implies changes to the -supervisor-level system or possibly to the standard calling convention.

-
-
-

Note that RV64I is not considered an extension of RV32I, but a different -complete base encoding.

-
-
-
-

38.1.4. Standard-Compatible Global Encodings

-
-

A complete or global encoding of an ISA for an actual RISC-V -implementation must allocate a unique non-conflicting prefix for every -included instruction encoding space. The bases and every standard -extension have each had a standard prefix allocated to ensure they can -all coexist in a global encoding.

-
-
-

A standard-compatible global encoding is one where the base and every -included standard extension have their standard prefixes. A -standard-compatible global encoding can include non-standard extensions -that do not conflict with the included standard extensions. A -standard-compatible global encoding can also use standard prefixes for -non-standard extensions if the associated standard extensions are not -included in the global encoding. In other words, a standard extension -must use its standard prefix if included in a standard-compatible global -encoding, but otherwise its prefix is free to be reallocated. These -constraints allow a common toolchain to target the standard subset of -any RISC-V standard-compatible global encoding.

-
-
-
-

38.1.5. Guaranteed Non-Standard Encoding Space

-
-

To support development of proprietary custom extensions, portions of the -encoding space are guaranteed to never be used by standard extensions.

-
-
-
-
-

38.2. RISC-V Extension Design Philosophy

-
-

We intend to support a large number of independently developed -extensions by encouraging extension developers to operate within -instruction encoding spaces, and by providing tools to pack these into a -standard-compatible global encoding by allocating unique prefixes. Some -extensions are more naturally implemented as brownfield augmentations of -existing extensions, and will share whatever prefix is allocated to -their parent greenfield extension. The standard extension prefixes avoid -spurious incompatibilities in the encoding of core functionality, while -allowing custom packing of more esoteric extensions.

-
-
-

This capability of repacking RISC-V extensions into different -standard-compatible global encodings can be used in a number of ways.

-
-
-

One use-case is developing highly specialized custom accelerators, designed to run kernels from important application domains. These might want to drop all but the base integer ISA and add in only the extensions that are required for the task in hand. The base ISAs have been designed to place minimal requirements on a hardware implementation, and have been encoded to use only a small fraction of a 32-bit instruction encoding space.

-
-
-

Another use-case is to build a research prototype for a new type of -instruction-set extension. The researchers might not want to expend the -effort to implement a variable-length instruction-fetch unit, and so -would like to prototype their extension using a simple 32-bit -fixed-width instruction encoding. However, this new extension might be -too large to coexist with standard extensions in the 32-bit space. If -the research experiments do not need all of the standard extensions, a -standard-compatible global encoding might drop the unused standard -extensions and reuse their prefixes to place the proposed extension in a -non-standard location to simplify engineering of the research prototype. -Standard tools will still be able to target the base and any standard -extensions that are present to reduce development time. Once the -instruction-set extension has been evaluated and refined, it could then -be made available for packing into a larger variable-length encoding -space to avoid conflicts with all standard extensions.

-
-
-

The following sections describe increasingly sophisticated strategies -for developing implementations with new instruction-set extensions. -These are mostly intended for use in highly customized, educational, or -experimental architectures rather than for the main line of RISC-V ISA -development.

-
-
-
-

38.3. Extensions within fixed-width 32-bit instruction format

-
-

In this section, we discuss adding extensions to implementations that -only support the base fixed-width 32-bit instruction format.

-
-
- - - - - -
- - -
-

We anticipate the simplest fixed-width 32-bit encoding will be popular -for many restricted accelerators and research prototypes.

-
-
-
-
-

38.3.1. Available 30-bit instruction encoding spaces

-
-

In the standard encoding, three of the available 30-bit instruction -encoding spaces (those with 2-bit prefixes 00, 01, and 10) are used to -enable the optional compressed instruction extension. However, if the -compressed instruction-set extension is not required, then these three -further 30-bit encoding spaces become available. This quadruples the -available encoding space within the 32-bit format.

-
-
-
-

38.3.2. Available 25-bit instruction encoding spaces

-
-

A 25-bit instruction encoding space corresponds to a major opcode in the -base and standard extension encodings.

-
-
-

There are four major opcodes expressly designated for custom extensions (Table 26), each of which represents a 25-bit encoding space. Two of these are reserved for eventual use in the RV128 base encoding (will be OP-IMM-64 and OP-64), but can be used for non-standard extensions for RV32 and RV64.

-
-
-

The two major opcodes reserved for RV64 (OP-IMM-32 and OP-32) can also -be used for non-standard extensions to RV32 only.

-
-
-

If an implementation does not require floating-point, then the seven -major opcodes reserved for standard floating-point extensions (LOAD-FP, -STORE-FP, MADD, MSUB, NMSUB, NMADD, OP-FP) can be reused for -non-standard extensions. Similarly, the AMO major opcode can be reused -if the standard atomic extensions are not required.

-
-
-

If an implementation does not require instructions longer than 32 bits, then an additional four major opcodes are available (those marked in gray in Table 26).

-
-
-

The base RV32I encoding uses only 11 major opcodes plus 3 reserved -opcodes, leaving up to 18 available for extensions. The base RV64I -encoding uses only 13 major opcodes plus 3 reserved opcodes, leaving up -to 16 available for extensions.

-
-
-
-

38.3.3. Available 22-bit instruction encoding spaces

-
-

A 22-bit encoding space corresponds to a funct3 minor opcode space in -the base and standard extension encodings. Several major opcodes have a -funct3 field minor opcode that is not completely occupied, leaving -available several 22-bit encoding spaces.

-
-
-

Usually a major opcode selects the format used to encode operands in the -remaining bits of the instruction, and ideally, an extension should -follow the operand format of the major opcode to simplify hardware -decoding.

-
-
-
-

38.3.4. Other spaces

-
-

Smaller spaces are available under certain major opcodes, and not all -minor opcodes are entirely filled.

-
-
-
-
-

38.4. Adding aligned 64-bit instruction extensions

-
-

The simplest approach to provide space for extensions that are too large -for the base 32-bit fixed-width instruction format is to add naturally -aligned 64-bit instructions. The implementation must still support the -32-bit base instruction format, but can require that 64-bit instructions -are aligned on 64-bit boundaries to simplify instruction fetch, with a -32-bit NOP instruction used as alignment padding where necessary.

-
-
-

To simplify use of standard tools, the 64-bit instructions should be -encoded as described in Table 1. -However, an implementation might choose a non-standard -instruction-length encoding for 64-bit instructions, while retaining the -standard encoding for 32-bit instructions. For example, if compressed -instructions are not required, then a 64-bit instruction could be -encoded using one or more zero bits in the first two bits of an -instruction.

-
-
- - - - - -
- - -
-

We anticipate processor generators that produce instruction-fetch units -capable of automatically handling any combination of supported -variable-length instruction encodings.

-
-
-
-
-
-

38.5. Supporting VLIW encodings

-
-

Although RISC-V was not designed as a base for a pure VLIW machine, VLIW -encodings can be added as extensions using several alternative -approaches. In all cases, the base 32-bit encoding has to be supported -to allow use of any standard software tools.

-
-
-

38.5.1. Fixed-size instruction group

-
-

The simplest approach is to define a single large naturally aligned -instruction format (e.g., 128 bits) within which VLIW operations are -encoded. In a conventional VLIW, this approach would tend to waste -instruction memory to hold NOPs, but a RISC-V-compatible implementation -would have to also support the base 32-bit instructions, confining the -VLIW code size expansion to VLIW-accelerated functions.

-
-
-
-

38.5.2. Encoded-Length Groups

-
-

Another approach is to use the standard length encoding from -Table 1 to encode parallel -instruction groups, allowing NOPs to be compressed out of the VLIW -instruction. For example, a 64-bit instruction could hold two 28-bit -operations, while a 96-bit instruction could hold three 28-bit -operations, and so on. Alternatively, a 48-bit instruction could hold -one 42-bit operation, while a 96-bit instruction could hold two 42-bit -operations, and so on.

-
-
-

This approach has the advantage of retaining the base ISA encoding for -instructions holding a single operation, but has the disadvantage of -requiring a new 28-bit or 42-bit encoding for operations within the VLIW -instructions, and misaligned instruction fetch for larger groups. One -simplification is to not allow VLIW instructions to straddle certain -microarchitecturally significant boundaries (e.g., cache lines or -virtual memory pages).

-
-
-
-

38.5.3. Fixed-Size Instruction Bundles

-
-

Another approach, similar to Itanium, is to use a larger naturally -aligned fixed instruction bundle size (e.g., 128 bits) across which -parallel operation groups are encoded. This simplifies instruction -fetch, but shifts the complexity to the group execution engine. To -remain RISC-V compatible, the base 32-bit instruction would still have -to be supported.

-
-
-
-

38.5.4. End-of-Group bits in Prefix

-
-

None of the above approaches retains the RISC-V encoding for the -individual operations within a VLIW instruction. Yet another approach is -to repurpose the two prefix bits in the fixed-width 32-bit encoding. One -prefix bit can be used to signal "end-of-group" if set, while the -second bit could indicate execution under a predicate if clear. Standard -RISC-V 32-bit instructions generated by tools unaware of the VLIW -extension would have both prefix bits set (11) and thus have the correct -semantics, with each instruction at the end of a group and not -predicated.

-
-
-

The main disadvantage of this approach is that the base ISAs lack the -complex predication support usually required in an aggressive VLIW -system, and it is difficult to add space to specify more predicate -registers in the standard 30-bit encoding space.

-
-
-
-
-
-
-

39. ISA Extension Naming Conventions

-
-
-

This chapter describes the RISC-V ISA extension naming scheme that is -used to concisely describe the set of instructions present in a hardware -implementation, or the set of instructions used by an application binary -interface (ABI).

-
-
- - - - - -
- - -
-

The RISC-V ISA is designed to support a wide variety of implementations -with various experimental instruction-set extensions. We have found that -an organized naming scheme simplifies software tools and documentation.

-
-
-
-
-

39.1. Case Sensitivity

-
-

The ISA naming strings are case insensitive.

-
-
-
-

39.2. Base Integer ISA

-
-

RISC-V ISA strings begin with either RV32I, RV32E, RV64I, RV64E, or RV128I -indicating the supported address space size in bits for the base integer -ISA.

-
-
-
-

39.3. Instruction-Set Extension Names

-
-

Standard ISA extensions are given a name consisting of a single letter. -For example, the first four standard extensions to the integer bases -are: "M" for integer multiplication and division, "A" for atomic -memory instructions, "F" for single-precision floating-point -instructions, and "D" for double-precision floating-point -instructions. Any RISC-V instruction-set variant can be succinctly -described by concatenating the base integer prefix with the names of the -included extensions, e.g., "RV64IMAFD".

-
-
-

We have also defined an abbreviation "G" to represent the -"IMAFDZicsr_Zifencei" base and extensions, as this is intended to -represent our standard general-purpose ISA.

-
-
-

Standard extensions to the RISC-V ISA are given other reserved letters, -e.g., "Q" for quad-precision floating-point, or "C" for the 16-bit -compressed instruction format.

-
-
-

Some ISA extensions depend on the presence of other extensions, e.g., -"D" depends on "F" and "F" depends on "Zicsr". These dependencies -may be implicit in the ISA name: for example, RV32IF is equivalent to -RV32IFZicsr, and RV32ID is equivalent to RV32IFD and RV32IFDZicsr.

-
-
-
-

39.4. Version Numbers

-
-

Recognizing that instruction sets may expand or alter over time, we -encode extension version numbers following the extension name. Version -numbers are divided into major and minor version numbers, separated by a -"p". If the minor version is "0", then "p0" can be omitted from -the version string. Changes in major version numbers imply a loss of -backwards compatibility, whereas changes in only the minor version -number must be backwards-compatible. For example, the original 64-bit -standard ISA defined in release 1.0 of this manual can be written in -full as "RV64I1p0M1p0A1p0F1p0D1p0", more concisely as -"RV64I1M1A1F1D1".

-
-
-

We introduced the version numbering scheme with the second release. -Hence, we define the default version of a standard extension to be the -version present at that time, e.g., "RV32I" is equivalent to -"RV32I2".

-
-
-
-

39.5. Underscores

-
-

Underscores "_" may be used to separate ISA extensions to improve -readability and to provide disambiguation, e.g., "RV32I2_M2_A2".

-
-
-

Because the "P" extension for Packed SIMD can be confused for the -decimal point in a version number, it must be preceded by an underscore -if it follows a number. For example, "rv32i2p2" means version 2.2 of -RV32I, whereas "rv32i2_p2" means version 2.0 of RV32I with version 2.0 -of the P extension.

-
-
-
-

39.6. Additional Standard Unprivileged Extension Names

-
-

Standard unprivileged extensions can also be named using a single "Z" followed by -an alphabetical name and an optional version number. For example, -"Zifencei" names the instruction-fetch fence extension described in -Chapter 6; "Zifencei2" and -"Zifencei2p0" name version 2.0 of same.

-
-
-

The first letter following the "Z" conventionally indicates the most -closely related alphabetical extension category, IMAFDQLCBKJTPVH. For the -"Zfa" extension for additional floating-point instructions, for example, the letter "f" -indicates the extension is related to the "F" standard extension. If -multiple "Z" extensions are named, they should be ordered first by -category, then alphabetically within a category—for example, -"Zicsr_Zifencei_Zam".

-
-
-

All multi-letter extensions, including those with the "Z" prefix, must be -separated from other multi-letter extensions by an underscore, e.g., -"RV32IMACZicsr_Zifencei".

-
-
-
-

39.7. Supervisor-level Instruction-Set Extensions

-
-

Standard extensions that extend the supervisor-level virtual-memory -architecture are prefixed with the letters "Sv", followed by an alphabetical -name and an optional version number, or by a numeric name with no version number. -Other standard extensions that extend -the supervisor-level architecture are prefixed with the letters "Ss", -followed by an alphabetical name and an optional version number. Such -extensions are defined in Volume II.

-
-
-

Standard supervisor-level extensions should be listed after standard -unprivileged extensions. If multiple supervisor-level extensions are -listed, they should be ordered alphabetically.

-
-
-
-

39.8. Hypervisor-level Instruction-Set Extensions

-
-

Standard extensions that extend the hypervisor-level architecture are prefixed -with the letters "Sh". -If multiple hypervisor-level extensions are listed, they should be ordered -alphabetically.

-
-
- - - - - -
Many augmentations to the hypervisor-level architecture are more naturally defined as supervisor-level extensions, following the scheme described in the previous section. The "Sh" prefix is used by the few hypervisor-level extensions that have no supervisor-visible effects.
-
-
-
-

39.9. Machine-level Instruction-Set Extensions

-
-

Standard machine-level instruction-set extensions are prefixed with the -letters "Sm".

-
-
-

Standard machine-level extensions should be listed after standard -lesser-privileged extensions. If multiple machine-level extensions are -listed, they should be ordered alphabetically.

-
-
-
-

39.10. Non-Standard Extension Names

-
-

Non-standard extensions are named using a single "X" followed by an -alphabetical name and an optional version number. For example, -"Xhwacha" names the Hwacha vector-fetch ISA extension; "Xhwacha2" -and "Xhwacha2p0" name version 2.0 of same.

-
-
-

Non-standard extensions must be listed after all standard extensions, and, -like other multi-letter extensions, must be separated from other multi-letter -extensions by an underscore. -For example, an ISA with non-standard extensions Argle and -Bargle may be named "RV64IZifencei_Xargle_Xbargle".

-
-
-

If multiple non-standard extensions are listed, they should be ordered -alphabetically.

-
-
-
-

39.11. Subset Naming Convention

-
-

Table 30 summarizes the standardized extension -names. The table also defines the canonical -order in which extension names must appear in the name string, with -top-to-bottom in table indicating first-to-last in the name string, -e.g., RV32IMACV is legal, whereas RV32IMAVC is not.

-
- - ----- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
Table 30. Standard ISA extension names.
Subset | Name | Implies

Base ISA

Integer

I

Reduced Integer

E

Standard Unprivileged Extensions

Integer Multiplication and Division

M

Zmmul

Atomics

A

Single-Precision Floating-Point

F

Zicsr

Double-Precision Floating-Point

D

F

General

G

IMAFDZicsr_Zifencei

Quad-Precision Floating-Point

Q

D

16-bit Compressed Instructions

C

B Extension

B

Packed-SIMD Extensions

P

Vector Extension

V

D

Hypervisor Extension

H

Additional Standard Unprivileged Extensions

Additional Standard unprivileged extensions "abc"

Zabc

Standard Supervisor-Level Extensions

Supervisor-level extension "def"

Ssdef

Standard Machine-Level Extensions

Machine-level extension "jkl"

Smjkl

Non-Standard Extensions

Non-standard extension "mno"

Xmno

-
-
-
-
-

40. History and Acknowledgments

-
-
-

40.1. "Why Develop a new ISA?" Rationale from Berkeley Group

-
-

We developed RISC-V to support our own needs in research and education, -where our group is particularly interested in actual hardware -implementations of research ideas (we have completed eleven different -silicon fabrications of RISC-V since the first edition of this -specification), and in providing real implementations for students to -explore in classes (RISC-V processor RTL designs have been used in -multiple undergraduate and graduate classes at Berkeley). In our current -research, we are especially interested in the move towards specialized -and heterogeneous accelerators, driven by the power constraints imposed -by the end of conventional transistor scaling. We wanted a highly -flexible and extensible base ISA around which to build our research -effort.

-
-
-

A question we have been repeatedly asked is "Why develop a new ISA?" -The biggest obvious benefit of using an existing commercial ISA is the -large and widely supported software ecosystem, both development tools -and ported applications, which can be leveraged in research and -teaching. Other benefits include the existence of large amounts of -documentation and tutorial examples. However, our experience of using -commercial instruction sets for research and teaching is that these -benefits are smaller in practice, and do not outweigh the disadvantages:

-
-
-
    -
  • -

    Commercial ISAs are proprietary. Except for SPARC V8, which is an open IEEE standard (IEEE Standard for a 32-Bit Microprocessor, 1994), most owners of commercial ISAs carefully guard their intellectual property and do not welcome freely available competitive implementations. This is much less of an issue for academic research and teaching using only software simulators, but has been a major concern for groups wishing to share actual RTL implementations. It is also a major concern for entities who do not want to trust the few sources of commercial ISA implementations, but who are prohibited from creating their own clean room implementations. We cannot guarantee that all RISC-V implementations will be free of third-party patent infringements, but we can guarantee we will not attempt to sue a RISC-V implementor.

    -
  • -
  • -

    Commercial ISAs are only popular in certain market domains. The most -obvious examples at time of writing are that the ARM architecture is not -well supported in the server space, and the Intel x86 architecture (or -for that matter, almost every other architecture) is not well supported -in the mobile space, though both Intel and ARM are attempting to enter -each other’s market segments. Another example is ARC and Tensilica, -which provide extensible cores but are focused on the embedded space. -This market segmentation dilutes the benefit of supporting a particular -commercial ISA as in practice the software ecosystem only exists for -certain domains, and has to be built for others.

    -
  • -
  • -

    Commercial ISAs come and go. Previous research infrastructures have -been built around commercial ISAs that are no longer popular (SPARC, -MIPS) or even no longer in production (Alpha). These lose the benefit of -an active software ecosystem, and the lingering intellectual property -issues around the ISA and supporting tools interfere with the ability of -interested third parties to continue supporting the ISA. An open ISA -might also lose popularity, but any interested party can continue using -and developing the ecosystem.

    -
  • -
  • -

    Popular commercial ISAs are complex. The dominant commercial ISAs -(x86 and ARM) are both very complex to implement in hardware to the -level of supporting common software stacks and operating systems. Worse, -nearly all the complexity is due to bad, or at least outdated, ISA -design decisions rather than features that truly improve efficiency.

    -
  • -
  • -

    Commercial ISAs alone are not enough to bring up applications. Even -if we expend the effort to implement a commercial ISA, this is not -enough to run existing applications for that ISA. Most applications need -a complete ABI (application binary interface) to run, not just the -user-level ISA. Most ABIs rely on libraries, which in turn rely on -operating system support. To run an existing operating system requires -implementing the supervisor-level ISA and device interfaces expected by -the OS. These are usually much less well-specified and considerably more -complex to implement than the user-level ISA.

    -
  • -
  • -

    Popular commercial ISAs were not designed for extensibility. The -dominant commercial ISAs were not particularly designed for -extensibility, and as a consequence have added considerable instruction -encoding complexity as their instruction sets have grown. Companies such -as Tensilica (acquired by Cadence) and ARC (acquired by Synopsys) have -built ISAs and toolchains around extensibility, but have focused on -embedded applications rather than general-purpose computing systems.

    -
  • -
  • -

    A modified commercial ISA is a new ISA. One of our main goals is to -support architecture research, including major ISA extensions. Even -small extensions diminish the benefit of using a standard ISA, as -compilers have to be modified and applications rebuilt from source code -to use the extension. Larger extensions that introduce new architectural -state also require modifications to the operating system. Ultimately, -the modified commercial ISA becomes a new ISA, but carries along all the -legacy baggage of the base ISA.

    -
  • -
-
-
-

Our position is that the ISA is perhaps the most important interface in -a computing system, and there is no reason that such an important -interface should be proprietary. The dominant commercial ISAs are based -on instruction-set concepts that were already well known over 30 years -ago. Software developers should be able to target an open standard -hardware target, and commercial processor designers should compete on -implementation quality.

-
-
-

We are far from the first to contemplate an open ISA design suitable for -hardware implementation. We also considered other existing open ISA -designs, of which the closest to our goals was the OpenRISC -architecture (OpenCores, 2012). We decided against adopting the OpenRISC ISA for several -technical reasons:

-
-
-
    -
  • -

    OpenRISC has condition codes and branch delay slots, which complicate -higher performance implementations.

    -
  • -
  • -

    OpenRISC uses a fixed 32-bit encoding and 16-bit immediates, which -precludes a denser instruction encoding and limits space for later -expansion of the ISA.

    -
  • -
  • -

    OpenRISC does not support the 2008 revision to the IEEE 754 -floating-point standard.

    -
  • -
  • -

    The OpenRISC 64-bit design had not been completed when we began.

    -
  • -
-
-
-

By starting from a clean slate, we could design an ISA that met all of -our goals, though of course, this took far more effort than we had -planned at the outset. We have now invested considerable effort in -building up the RISC-V ISA infrastructure, including documentation, -compiler tool chains, operating system ports, reference ISA simulators, -FPGA implementations, efficient ASIC implementations, architecture test -suites, and teaching materials. Since the last edition of this manual, -there has been considerable uptake of the RISC-V ISA in both academia -and industry, and we have created the non-profit RISC-V Foundation to -protect and promote the standard. The RISC-V Foundation website at -riscv.org contains the latest information on the Foundation -membership and various open-source projects using RISC-V.

-
-
-
-

40.2. History from Revision 1.0 of ISA manual

-
-

The RISC-V ISA and instruction-set manual builds upon several earlier -projects. Several aspects of the supervisor-level machine and the -overall format of the manual date back to the T0 (Torrent-0) vector -microprocessor project at UC Berkeley and ICSI, begun in 1992. T0 was a -vector processor based on the MIPS-II ISA, with Krste Asanović as main -architect and RTL designer, and Brian Kingsbury and Bertrand Irrisou as -principal VLSI implementors. David Johnson at ICSI was a major -contributor to the T0 ISA design, particularly supervisor mode, and to -the manual text. John Hauser also provided considerable feedback on the -T0 ISA design.

-
-
-

The Scale (Software-Controlled Architecture for Low Energy) project at -MIT, begun in 2000, built upon the T0 project infrastructure, refined -the supervisor-level interface, and moved away from the MIPS scalar ISA -by dropping the branch delay slot. Ronny Krashinsky and Christopher -Batten were the principal architects of the Scale Vector-Thread -processor at MIT, while Mark Hampton ported the GCC-based compiler -infrastructure and tools for Scale.

-
-
-

A lightly edited version of the T0 MIPS scalar processor specification -(MIPS-6371) was used in teaching a new version of the MIT 6.371 -Introduction to VLSI Systems class in the Fall 2002 semester, with Chris -Terman and Krste Asanović as lecturers. Chris Terman contributed most of -the lab material for the class (there was no TA!). The 6.371 class -evolved into the trial 6.884 Complex Digital Design class at MIT, taught -by Arvind and Krste Asanović in Spring 2005, which became a regular -Spring class 6.375. A reduced version of the Scale MIPS-based scalar -ISA, named SMIPS, was used in 6.884/6.375. Christopher Batten was the TA -for the early offerings of these classes and developed a considerable -amount of documentation and lab material based around the SMIPS ISA. -This same SMIPS lab material was adapted and enhanced by TA Yunsup Lee -for the UC Berkeley Fall 2009 CS250 VLSI Systems Design class taught by -John Wawrzynek, Krste Asanović, and John Lazzaro.

-
-
-

The Maven (Malleable Array of Vector-thread ENgines) project was a -second-generation vector-thread architecture. Its design was led by -Christopher Batten when he was an Exchange Scholar at UC Berkeley -starting in summer 2007. Hidetaka Aoki, a visiting industrial fellow -from Hitachi, gave considerable feedback on the early Maven ISA and -microarchitecture design. The Maven infrastructure was based on the -Scale infrastructure but the Maven ISA moved further away from the MIPS -ISA variant defined in Scale, with a unified floating-point and integer -register file. Maven was designed to support experimentation with -alternative data-parallel accelerators. Yunsup Lee was the main -implementor of the various Maven vector units, while Rimas Avižienis was -the main implementor of the various Maven scalar units. Yunsup Lee and -Christopher Batten ported GCC to work with the new Maven ISA. -Christopher Celio provided the initial definition of a traditional -vector instruction set ("Flood") variant of Maven.

-
-
-

Based on experience with all these previous projects, the RISC-V ISA -definition was begun in Summer 2010, with Andrew Waterman, Yunsup Lee, -Krste Asanović, and David Patterson as principal designers. An initial -version of the RISC-V 32-bit instruction subset was used in the UC -Berkeley Fall 2010 CS250 VLSI Systems Design class, with Yunsup Lee as -TA. RISC-V is a clean break from the earlier MIPS-inspired designs. John -Hauser contributed to the floating-point ISA definition, including the -sign-injection instructions and a register encoding scheme that permits -internal recoding of floating-point values.

-
-
-
-

40.3. History from Revision 2.0 of ISA manual

-
-

Multiple implementations of RISC-V processors have been completed, -including several silicon fabrications, as shown in -Fabricated RISC-V testchips table.

-
- ------ - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
Name | Tapeout Date | Process | ISA

Raven-1

May 29, 2011

ST 28nm FDSOI

RV64G1_Xhwacha1

EOS14

April 1, 2012

IBM 45nm SOI

RV64G1p1_Xhwacha2

EOS16

August 17, 2012

IBM 45nm SOI

RV64G1p1_Xhwacha2

Raven-2

August 22, 2012

ST 28nm FDSOI

RV64G1p1_Xhwacha2

EOS18

February 6, 2013

IBM 45nm SOI

RV64G1p1_Xhwacha2

EOS20

July 3, 2013

IBM 45nm SOI

RV64G1p99_Xhwacha2

Raven-3

September 26, 2013

ST 28nm SOI

RV64G1p99_Xhwacha2

EOS22

March 7, 2014

IBM 45nm SOI

RV64G1p9999_Xhwacha3

-
-

The first RISC-V processors to be fabricated were written in Verilog and -manufactured in a pre-production FDSOI technology from ST as the Raven-1 -testchip in 2011. Two cores were developed by Yunsup Lee and Andrew -Waterman, advised by Krste Asanović, and fabricated together: 1) an RV64 -scalar core with error-detecting flip-flops, and 2) an RV64 core with an -attached 64-bit floating-point vector unit. The first microarchitecture -was informally known as "TrainWreck", due to the short time available -to complete the design with immature design libraries.

-
-
-

Subsequently, a clean microarchitecture for an in-order decoupled RV64 -core was developed by Andrew Waterman, Rimas Avižienis, and Yunsup Lee, -advised by Krste Asanović, and, continuing the railway theme, was -codenamed "Rocket" after George Stephenson’s successful steam -locomotive design. Rocket was written in Chisel, a new hardware design -language developed at UC Berkeley. The IEEE floating-point units used in -Rocket were developed by John Hauser, Andrew Waterman, and Brian -Richards. Rocket has since been refined and developed further, and has -been fabricated two more times in FDSOI (Raven-2, Raven-3), and five -times in IBM SOI technology (EOS14, EOS16, EOS18, EOS20, EOS22) for a -photonics project. Work is ongoing to make the Rocket design available -as a parameterized RISC-V processor generator.

-
-
-

EOS14-EOS22 chips include early versions of Hwacha, a 64-bit IEEE -floating-point vector unit, developed by Yunsup Lee, Andrew Waterman, -Huy Vo, Albert Ou, Quan Nguyen, and Stephen Twigg, advised by Krste -Asanović. EOS16-EOS22 chips include dual cores with a cache-coherence -protocol developed by Henry Cook and Andrew Waterman, advised by Krste -Asanović. EOS14 silicon has successfully run at 1.25 GHz. EOS16 silicon suffered -from a bug in the IBM pad libraries. EOS18 and EOS20 have successfully -run at 1.35 GHz.

-
-
-

Contributors to the Raven testchips include Yunsup Lee, Andrew Waterman, -Rimas Avižienis, Brian Zimmer, Jaehwa Kwak, Ruzica Jevtić, Milovan -Blagojević, Alberto Puggelli, Steven Bailey, Ben Keller, Pi-Feng Chiu, -Brian Richards, Borivoje Nikolić, and Krste Asanović.

-
-
-

Contributors to the EOS testchips include Yunsup Lee, Rimas Avižienis, -Andrew Waterman, Henry Cook, Huy Vo, Daiwei Li, Chen Sun, Albert Ou, -Quan Nguyen, Stephen Twigg, Vladimir Stojanović, and Krste Asanović.

-
-
-

Andrew Waterman and Yunsup Lee developed the C++ ISA simulator -"Spike", used as a golden model in development and named after the -golden spike used to celebrate completion of the US transcontinental -railway. Spike has been made available as a BSD open-source project.

-
-
-

Andrew Waterman completed a Master’s thesis with a preliminary design of -the RISC-V compressed instruction set (Waterman, 2011).

-
-
-

Various FPGA implementations of the RISC-V have been completed, -primarily as part of integrated demos for the Par Lab project research -retreats. The largest FPGA design has 3 cache-coherent RV64IMA -processors running a research operating system. Contributors to the FPGA -implementations include Andrew Waterman, Yunsup Lee, Rimas Avižienis, -and Krste Asanović.

-
-
-

RISC-V processors have been used in several classes at UC Berkeley. -Rocket was used in the Fall 2011 offering of CS250 as a basis for class -projects, with Brian Zimmer as TA. For the undergraduate CS152 class in -Spring 2012, Christopher Celio used Chisel to write a suite of -educational RV32 processors, named "Sodor" after the island on which -"Thomas the Tank Engine" and friends live. The suite includes a -microcoded core, an unpipelined core, and 2, 3, and 5-stage pipelined -cores, and is publicly available under a BSD license. The suite was -subsequently updated and used again in CS152 in Spring 2013, with Yunsup -Lee as TA, and in Spring 2014, with Eric Love as TA. Christopher Celio -also developed an out-of-order RV64 design known as BOOM (Berkeley -Out-of-Order Machine), with accompanying pipeline visualizations, that -was used in the CS152 classes. The CS152 classes also used -cache-coherent versions of the Rocket core developed by Andrew Waterman -and Henry Cook.

-
-
-

Over the summer of 2013, the RoCC (Rocket Custom Coprocessor) interface -was defined to simplify adding custom accelerators to the Rocket core. -Rocket and the RoCC interface were used extensively in the Fall 2013 -CS250 VLSI class taught by Jonathan Bachrach, with several student -accelerator projects built to the RoCC interface. The Hwacha vector unit -has been rewritten as a RoCC coprocessor.

-
-
-

Two Berkeley undergraduates, Quan Nguyen and Albert Ou, successfully ported Linux to run on RISC-V in Spring 2013.

-
-
-

Colin Schmidt successfully completed an LLVM backend for RISC-V 2.0 in -January 2014.

-
-
-

Darius Rad at Bluespec contributed soft-float ABI support to the GCC -port in March 2014.

-
-
-

John Hauser contributed the definition of the floating-point -classification instructions.

-
-
-

We are aware of several other RISC-V core implementations, including one -in Verilog by Tommy Thorn, and one in Bluespec by Rishiyur Nikhil.

-
-
-
-

40.4. Acknowledgments

-
-

Thanks to Christopher F. Batten, Preston Briggs, Christopher Celio, -David Chisnall, Stefan Freudenberger, John Hauser, Ben Keller, Rishiyur -Nikhil, Michael Taylor, Tommy Thorn, and Robert Watson for comments on -the draft ISA version 2.0 specification.

-
-
-
-

40.5. History from Revision 2.1

-
-

Uptake of the RISC-V ISA has been very rapid since the introduction of -the frozen version 2.0 in May 2014, with too much activity to record in -a short history section such as this. Perhaps the most important single -event was the formation of the non-profit RISC-V Foundation in August -2015. The Foundation will now take over stewardship of the official -RISC-V ISA standard, and the official website riscv.org is the best -place to obtain news and updates on the RISC-V standard.

-
-
-
-

40.6. Acknowledgments

-
-

Thanks to Scott Beamer, Allen J. Baum, Christopher Celio, David -Chisnall, Paul Clayton, Palmer Dabbelt, Jan Gray, Michael Hamburg, and -John Hauser for comments on the version 2.0 specification.

-
-
-
-

40.7. History from Revision 2.2

- -
-
-

40.8. Acknowledgments

-
-

Thanks to Jacob Bachmeyer, Alex Bradbury, David Horner, Stefan O’Rear, -and Joseph Myers for comments on the version 2.1 specification.

-
-
-
-

40.9. History for Revision 2.3

-
-

Uptake of RISC-V continues at a breakneck pace.

-
-
-

John Hauser and Andrew Waterman contributed a hypervisor ISA extension -based upon a proposal from Paolo Bonzini.

-
-
-

Daniel Lustig, Arvind, Krste Asanović, Shaked Flur, Paul Loewenstein, -Yatin Manerkar, Luc Maranget, Margaret Martonosi, Vijayanand Nagarajan, -Rishiyur Nikhil, Jonas Oberhauser, Christopher Pulte, Jose Renau, Peter -Sewell, Susmit Sarkar, Caroline Trippel, Muralidaran Vijayaraghavan, -Andrew Waterman, Derek Williams, Andrew Wright, and Sizhuo Zhang -contributed the memory consistency model.

-
-
-
-

40.10. Funding

-
-

Development of the RISC-V architecture and implementations has been -partially funded by the following sponsors.

-
-
-
    -
  • -

    Par Lab: Research supported by Microsoft (Award # 024263) and Intel -(Award # 024894) funding and by matching funding by U.C. Discovery (Award -# DIG07-10227). Additional support came from Par Lab affiliates Nokia, -NVIDIA, Oracle, and Samsung.

    -
  • -
  • -

    Project Isis: DoE Award DE-SC0003624.

    -
  • -
  • -

    ASPIRE Lab: DARPA PERFECT program, Award HR0011-12-2-0016. DARPA -POEM program Award HR0011-11-C-0100. The Center for Future Architectures -Research (C-FAR), a STARnet center funded by the Semiconductor Research -Corporation. Additional support from ASPIRE industrial sponsor, Intel, -and ASPIRE affiliates, Google, Hewlett Packard Enterprise, Huawei, -Nokia, NVIDIA, Oracle, and Samsung.

    -
  • -
-
-
-

The content of this paper does not necessarily reflect the position or -the policy of the US government and no official endorsement should be -inferred.

-
-
-
-
-
-

Appendix A: RVWMO Explanatory Material, Version 0.1

-
-
-

This section provides more explanation for RVWMO (Chapter 18), using more informal language and concrete examples. These are intended to clarify the meaning and intent of the axioms and preserved program order rules. This appendix should be treated as commentary; all normative material is provided in Chapter 18 and in the rest of the main body of the ISA specification. All currently known discrepancies are listed in Section A.7. Any other discrepancies are unintentional.

-
-
-

A.1. Why RVWMO?

-
-

Memory consistency models fall along a loose spectrum from weak to -strong. Weak memory models allow more hardware implementation -flexibility and deliver arguably better performance, performance per -watt, power, scalability, and hardware verification overheads than -strong models, at the expense of a more complex programming model. -Strong models provide simpler programming models, but at the cost of -imposing more restrictions on the kinds of (non-speculative) hardware -optimizations that can be performed in the pipeline and in the memory -system, and in turn imposing some cost in terms of power, area overhead, -and verification burden.

-
-
-

RISC-V has chosen the RVWMO memory model, a variant of release -consistency. This places it in between the two extremes of the memory -model spectrum. The RVWMO memory model enables architects to build -simple implementations, aggressive implementations, implementations -embedded deeply inside a much larger system and subject to complex -memory system interactions, or any number of other possibilities, all -while simultaneously being strong enough to support programming language -memory models at high performance.

-
-
-

To facilitate the porting of code from other architectures, some -hardware implementations may choose to implement the Ztso extension, -which provides stricter RVTSO ordering semantics by default. Code -written for RVWMO is automatically and inherently compatible with RVTSO, -but code written assuming RVTSO is not guaranteed to run correctly on -RVWMO implementations. In fact, most RVWMO implementations will (and -should) simply refuse to run RVTSO-only binaries. Each implementation -must therefore choose whether to prioritize compatibility with RVTSO -code (e.g., to facilitate porting from x86) or whether to instead -prioritize compatibility with other RISC-V cores implementing RVWMO.

-
-
-

Some fences and/or memory ordering annotations in code written for RVWMO -may become redundant under RVTSO; the cost that the default of RVWMO -imposes on Ztso implementations is the incremental overhead of fetching -those fences (e.g., FENCE R,RW and FENCE RW,W) which become no-ops on -that implementation. However, these fences must remain present in the -code if compatibility with non-Ztso implementations is desired.

-
-
-
-

A.2. Litmus Tests

-
-

The explanations in this chapter make use of litmus tests, or small programs designed to test or highlight one particular aspect of a memory model. Table 31 shows an example of a litmus test with two harts. As a convention for this figure and for all figures that follow in this chapter, we assume that s0-s2 are pre-set to the same value in all harts and that s0 holds the address labeled x, s1 holds y, and s2 holds z, where x, y, and z are disjoint memory locations aligned to 8-byte boundaries. All other registers and all referenced memory locations are presumed to be initialized to zero. Each figure shows the litmus test code on the left, and a visualization of one particular valid or invalid execution on the right.

-
- - ---- - - - - - - -
Table 31. A sample litmus test and one forbidden execution (a0=1).
------ - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
Hart 0 | Hart 1

li t1,1

li t4,4

(a)

sw t1,0(s0)

(e)

sw t4,0(s0)

li t2,2

(b)

sw t2,0(s0)

(c)

lw a0,0(s0)

li t3,3

li t5,5

(d)

sw t3,0(s0)

(f)

sw t5,0(s0)

--- - - - - - -
-
-litmus sample -
-
-
-

Litmus tests are used to understand the implications of the memory model in specific concrete situations. For example, in the litmus test of Table 31, the final value of a0 in the first hart can be either 2, 4, or 5, depending on the dynamic interleaving of the instruction stream from each hart at runtime. However, in this example, the final value of a0 in Hart 0 will never be 1 or 3; intuitively, the value 1 will no longer be visible at the time the load executes, and the value 3 will not yet be visible by the time the load executes. We analyze this test and many others below.

-
-
- - ---- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
Table 32. A key for the litmus test diagrams drawn in this appendix
Edge | Full Name (and explanation)

rf

Reads From (from each store to the loads that return a value -written by that store)

co

Coherence (a total order on the stores to each address)

fr

From-Reads (from each load to co-successors of the store from which -the load returned a value)

ppo

Preserved Program Order

fence

Orderings enforced by a FENCE instruction

addr

Address Dependency

ctrl

Control Dependency

data

Data Dependency

-
-

The diagram shown to the right of each litmus test shows a visual -representation of the particular execution candidate being considered. -These diagrams use a notation that is common in the memory model -literature for constraining the set of possible global memory orders -that could produce the execution in question. It is also the basis for -the herd models presented in -Section B.2. This notation is explained in -Table 32. Of the listed relations, rf edges between -harts, co edges, fr edges, and ppo edges directly constrain the global -memory order (as do fence, addr, data, and some ctrl edges, via ppo). -Other edges (such as intra-hart rf edges) are informative but do not -constrain the global memory order.

-
-
-

For example, in Table 31, a0=1 could occur only if one of the following were true:

-
-
-
    -
  • -

    (b) appears before (a) in global memory order (and in the coherence order co). However, this violates RVWMO PPO rule 1. The co edge from (b) to (a) highlights this contradiction.

    -
  • -
  • -

    (a) appears before (b) in global memory order (and in the -coherence order co). However, in this case, the Load Value Axiom would -be violated, because (a) is not the latest matching store prior to (c) -in program order. The fr edge from (c) to (b) highlights this -contradiction.

    -
  • -
-
-
-

Since neither of these scenarios satisfies the RVWMO axioms, the outcome -a0=1 is forbidden.

-
-
-

Beyond what is described in this appendix, a suite of more than seven -thousand litmus tests is available at -github.com/litmus-tests/litmus-tests-riscv.

-
-
- - - - - -
- - -
-

The litmus tests repository also provides instructions on how to run the -litmus tests on RISC-V hardware and how to compare the results with the -operational and axiomatic models.

-
-
-

In the future, we expect to adapt these memory model litmus tests for -use as part of the RISC-V compliance test suite as well.

-
-
-
-
-
-

A.3. Explaining the RVWMO Rules

-
-

In this section, we provide explanation and examples for all of the -RVWMO rules and axioms.

-
-
-

A.3.1. Preserved Program Order and Global Memory Order

-
-

Preserved program order represents the subset of program order that must -be respected within the global memory order. Conceptually, events from -the same hart that are ordered by preserved program order must appear in -that order from the perspective of other harts and/or observers. Events -from the same hart that are not ordered by preserved program order, on -the other hand, may appear reordered from the perspective of other harts -and/or observers.

-
-
-

Informally, the global memory order represents the order in which loads -and stores perform. The formal memory model literature has moved away -from specifications built around the concept of performing, but the idea -is still useful for building up informal intuition. A load is said to -have performed when its return value is determined. A store is said to -have performed not when it has executed inside the pipeline, but rather -only when its value has been propagated to globally visible memory. In -this sense, the global memory order also represents the contribution of -the coherence protocol and/or the rest of the memory system to -interleave the (possibly reordered) memory accesses being issued by each -hart into a single total order agreed upon by all harts.

-
-
-

The order in which loads perform does not always directly correspond to -the relative age of the values those two loads return. In particular, a -load b may perform before another load a to -the same address (i.e., b may execute before -a, and b may appear before a -in the global memory order), but a may nevertheless return -an older value than b. This discrepancy captures (among -other things) the reordering effects of buffering placed between the -core and memory. For example, b may have returned a value -from a store in the store buffer, while a may have ignored -that younger store and read an older value from memory instead. To -account for this, at the time each load performs, the value it returns -is determined by the load value axiom, not just strictly by determining -the most recent store to the same address in the global memory order, as -described below.

-
-
-
-

A.3.2. Load value axiom

-
- - - - - -
- - -
-

Section 18.1.4.1: Each byte of each load i returns the value written -to that byte by the store that is the latest in global memory order among -the following stores:

-
-
-
    -
  1. -

    Stores that write that byte and that precede i in the global memory -order

    -
  2. -
  3. -

    Stores that write that byte and that precede i in program order

    -
  4. -
-
-
-
-
-

Preserved program order is not required to respect the ordering of a -store followed by a load to an overlapping address. This complexity -arises due to the ubiquity of store buffers in nearly all -implementations. Informally, the load may perform (return a value) by -forwarding from the store while the store is still in the store buffer, -and hence before the store itself performs (writes back to globally -visible memory). Any other hart will therefore observe the load as -performing before the store.

-
-
-

Consider the litmus test of Table 33. When running this program on an implementation with store buffers, it is possible to arrive at the final outcome a0=1, a1=0, a2=1, a3=0 as follows:

-
- - ---- - - - - - - -
Table 33. A store buffer forwarding litmus test (outcome permitted)
------ - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
Hart 0 | Hart 1

li t1, 1

li t1, 1

(a) sw t1,0(s0)

(e) sw t1,0(s1)

(b) lw a0,0(s0)

(f) lw a2,0(s1)

(c) fence r,r

(g) fence r,r

(d) lw a1,0(s1)

(h) lw a3,0(s0)

Outcome: a0=1, a1=0, a2=1, a3=0

--- - - - - - -
-
-litmus sb fwd -
-
-
-
    -
  • -

    (a) executes and enters the first hart’s private store buffer

    -
  • -
  • -

    (b) executes and forwards its return value 1 from (a) in the -store buffer

    -
  • -
  • -

    (c) executes since all previous loads (i.e., (b)) have -completed

    -
  • -
  • -

    (d) executes and reads the value 0 from memory

    -
  • -
  • -

    (e) executes and enters the second hart’s private store buffer

    -
  • -
  • -

    (f) executes and forwards its return value 1 from (e) in the -store buffer

    -
  • -
  • -

    (g) executes since all previous loads (i.e., (f)) have -completed

    -
  • -
  • -

    (h) executes and reads the value 0 from memory

    -
  • -
  • -

    (a) drains from the first hart’s store buffer to memory

    -
  • -
  • -

    (e) drains from the second hart’s store buffer to memory

    -
  • -
-
-
-

Therefore, the memory model must be able to account for this behavior.

-
-
-

To put it another way, suppose the definition of preserved program order -did include the following hypothetical rule: memory access -a precedes memory access b in preserved -program order (and hence also in the global memory order) if -a precedes b in program order and -a and b are accesses to the same memory -location, a is a write, and b is a read. -Call this "Rule X". Then we get the following:

-
-
-
    -
  • -

    (a) precedes (b): by rule X

    -
  • -
  • -

    (b) precedes (d): by rule 4

    -
  • -
  • -

    (d) precedes (e): by the load value axiom. Otherwise, if (e) preceded (d), then (d) would be required to return the value 1. (This is a perfectly legal execution; it’s just not the one in question.)

    -
  • -
  • -

    (e) precedes (f): by rule X

    -
  • -
  • -

    (f) precedes (h): by rule 4

    -
  • -
  • -

    (h) precedes (a): by the load value axiom, as above.

    -
  • -
-
-
-

The global memory order must be a total order and cannot be cyclic, -because a cycle would imply that every event in the cycle happens before -itself, which is impossible. Therefore, the execution proposed above -would be forbidden, and hence the addition of rule X would forbid -implementations with store buffer forwarding, which would clearly be -undesirable.

-
-
-

Nevertheless, even if (b) precedes (a) and/or (f) precedes (e) in the -global memory order, the only sensible possibility in this example is -for (b) to return the value written by (a), and likewise for (f) and -(e). This combination of circumstances is what leads to the second -option in the definition of the load value axiom. Even though (b) -precedes (a) in the global memory order, (a) will still be visible to -(b) by virtue of sitting in the store buffer at the time (b) executes. -Therefore, even if (b) precedes (a) in the global memory order, (b) -should return the value written by (a) because (a) precedes (b) in -program order. Likewise for (e) and (f).

-
- - ---- - - - - - - -
Table 34. The "PPOCA" store buffer forwarding litmus test (outcome permitted)
------ - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
Hart 0 | Hart 1

li t1, 1

li t1, 1

(a)

sw t1,0(s0)

LOOP:

(b)

fence w,w

(d)

lw a0,0(s1)

(c)

sw t1,0(s1)

beqz a0, LOOP

(e)

sw t1,0(s2)

(f)

lw a1,0(s2)

xor a2,a1,a1

add s0,s0,a2

(g)

lw a2,0(s0)

Outcome: a0=1, a1=1, a2=0

--- - - - - - -
-
-litmus ppoca -
-
-
-

Another test that highlights the behavior of store buffers is shown in -Table 34. In this example, (d) is -ordered before (e) because of the control dependency, and (f) is ordered -before (g) because of the address dependency. However, (e) is not -necessarily ordered before (f), even though (f) returns the value -written by (e). This could correspond to the following sequence of -events:

-
-
-
    -
  • -

    (e) executes speculatively and enters the second hart’s private -store buffer (but does not drain to memory)

    -
  • -
  • -

    (f) executes speculatively and forwards its return value 1 from -(e) in the store buffer

    -
  • -
  • -

    (g) executes speculatively and reads the value 0 from memory

    -
  • -
  • -

    (a) executes, enters the first hart’s private store buffer, and -drains to memory

    -
  • -
  • -

    (b) executes and retires

    -
  • -
  • -

    (c) executes, enters the first hart’s private store buffer, and -drains to memory

    -
  • -
  • -

    (d) executes and reads the value 1 from memory

    -
  • -
  • -

    (e), (f), and (g) commit, since the speculation turned out to be -correct

    -
  • -
  • -

    (e) drains from the store buffer to memory

    -
  • -
-
-
-
-

A.3.3. Atomicity axiom

-
- - - - - -
- - -
-

Atomicity Axiom (for Aligned Atomics): If r and w are paired load and -store operations generated by aligned LR and SC instructions in a hart -h, s is a store to byte x, and r returns a value written by s, then s must -precede w in the global memory order, and there can be no store from -a hart other than h to byte x following s and preceding w in the global -memory order.

-
-
-
-
-

The RISC-V architecture decouples the notion of atomicity from the -notion of ordering. Unlike architectures such as TSO, RISC-V atomics -under RVWMO do not impose any ordering requirements by default. Ordering -semantics are only guaranteed by the PPO rules that otherwise apply.

-
-
-

RISC-V contains two types of atomics: AMOs and LR/SC pairs. These -conceptually behave differently, in the following way. LR/SC behave as -if the old value is brought up to the core, modified, and written back -to memory, all while a reservation is held on that memory location. AMOs -on the other hand conceptually behave as if they are performed directly -in memory. AMOs are therefore inherently atomic, while LR/SC pairs are -atomic in the slightly different sense that the memory location in -question will not be modified by another hart during the time the -original hart holds the reservation.

-
- ------ - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
(a) lr.d a0, 0(s0) | (a) lr.d a0, 0(s0) | (a) lr.w a0, 0(s0) | (a) lr.w a0, 0(s0)

(b) sd t1, 0(s0)

(b) sw t1, 4(s0)

(b) sw t1, 4(s0)

(b) sw t1, 4(s0)

(c) sc.d t3, t2, 0(s0)

(c) sc.d t3, t2, 0(s0)

(c) sc.w t3, t2, 0(s0)

(c) addi s0, s0, 8

(d) sc.w t3, t2, 8(s0)

-
-

Figure 4: In all four (independent) instances, the final store-conditional instruction is permitted but not guaranteed to succeed.

-
-
-

The atomicity axiom forbids stores from other harts from being interleaved in global memory order between an LR and the SC paired with that LR. The atomicity axiom does not forbid loads from being interleaved between the paired operations in program order or in the global memory order, nor does it forbid stores from the same hart or stores to non-overlapping locations from appearing between the paired operations in either program order or in the global memory order. For example, the SC instructions in Figure 4 may (but are not guaranteed to) succeed. None of those successes would violate the atomicity axiom, because the intervening non-conditional stores are from the same hart as the paired load-reserved and store-conditional instructions. This way, a memory system that tracks memory accesses at cache line granularity (and which therefore will see the four snippets of Figure 4 as identical) will not be forced to fail a store-conditional instruction that happens to (falsely) share another portion of the same cache line as the memory location being held by the reservation.

-
-
-

The atomicity axiom also technically supports cases in which the LR and -SC touch different addresses and/or use different access sizes; however, -use cases for such behaviors are expected to be rare in practice. -Likewise, scenarios in which stores from the same hart between an LR/SC -pair actually overlap the memory location(s) referenced by the LR or SC -are expected to be rare compared to scenarios where the intervening -store may simply fall onto the same cache line.

-
-
-
-

A.3.4. Progress axiom

-
- - - - - -
- - -
-

Progress Axiom: No memory operation may be preceded in the global -memory order by an infinite sequence of other memory operations.

-
-
-
-
-

The progress axiom ensures a minimal forward progress guarantee. It -ensures that stores from one hart will eventually be made visible to -other harts in the system in a finite amount of time, and that loads -from other harts will eventually be able to read those values (or -successors thereof). Without this rule, it would be legal, for example, -for a spinlock to spin infinitely on a value, even with a store from -another hart waiting to unlock the spinlock.

-
-
-

The progress axiom is intended not to impose any other notion of -fairness, latency, or quality of service onto the harts in a RISC-V -implementation. Any stronger notions of fairness are up to the rest of -the ISA and/or up to the platform and/or device to define and implement.

-
-
-

The forward progress axiom will in almost all cases be naturally -satisfied by any standard cache coherence protocol. Implementations with -non-coherent caches may have to provide some other mechanism to ensure -the eventual visibility of all stores (or successors thereof) to all -harts.

-
-
-
-

A.3.5. Overlapping-Address Orderings (Rules 1-3)

-
- - - - - -
- - -
-

Rule 1: b is a store, and a and b access overlapping memory addresses

-
-
-

Rule 2: a and b are loads, x is a byte read by both a and b, there is no -store to x between a and b in program order, and a and b return values -for x written by different memory operations

-
-
-

Rule 3: a is generated by an AMO or SC instruction, b is a load, and b -returns a value written by a

-
-
-
-
-

Same-address orderings where the latter is a store are straightforward: a load or store can never be reordered with a later store to an overlapping memory location. From a microarchitecture perspective, generally speaking, it is difficult or impossible to undo a speculatively reordered store if the speculation turns out to be invalid, so such behavior is simply disallowed by the model. Same-address orderings from a store to a later load, on the other hand, do not need to be enforced. As discussed in Section A.3.2 (Load value axiom), this reflects the observable behavior of implementations that forward values from buffered stores to later loads.

-
-
-

Same-address load-load ordering requirements are far more subtle. The -basic requirement is that a younger load must not return a value that is -older than a value returned by an older load in the same hart to the -same address. This is often known as "CoRR" (Coherence for Read-Read -pairs), or as part of a broader "coherence" or "sequential -consistency per location" requirement. Some architectures in the past -have relaxed same-address load-load ordering, but in hindsight this is -generally considered to complicate the programming model too much, and -so RVWMO requires CoRR ordering to be enforced. However, because the -global memory order corresponds to the order in which loads perform -rather than the ordering of the values being returned, capturing CoRR -requirements in terms of the global memory order requires a bit of -indirection.

-
- - ---- - - - - - - -
Table 35. Litmus test MP+fence.w.w+fri-rfi-addr (outcome permitted)
------ - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
Hart 0 | Hart 1

li t1, 1

li t2, 2

(a)

sw t1,0(s0)

(d)

lw a0,0(s1)

(b)

fence w, w

(e)

sw t2,0(s1)

(c)

sw t1,0(s1)

(f)

lw a1,0(s1)

(g)

xor t3,a1,a1

(h)

add s0,s0,t3

(i)

lw a2,0(s0)

Outcome: a0=1, a1=2, a2=0

--- - - - - - -
-
-litmus mp fenceww fri rfi addr -
-
-
-

Consider the litmus test of Table 35, which is one particular instance of the more general "fri-rfi" pattern. The term "fri-rfi" refers to the sequence (d), (e), (f): (d) "from-reads" (i.e., reads from an earlier write than) (e), which is in the same hart, and (f) reads from (e), which is in the same hart.

-
-
-

From a microarchitectural perspective, outcome a0=1, a1=2, a2=0 is -legal (as are various other less subtle outcomes). Intuitively, the -following would produce the outcome in question:

-
-
-
    -
  • -

    (d) stalls (for whatever reason; perhaps it’s stalled waiting -for some other preceding instruction)

    -
  • -
  • -

    (e) executes and enters the store buffer (but does not yet -drain to memory)

    -
  • -
  • -

    (f) executes and forwards from (e) in the store buffer

    -
  • -
  • -

    (g), (h), and (i) execute

    -
  • -
  • -

    (a) executes and drains to memory, (b) executes, and (c) -executes and drains to memory

    -
  • -
  • -

    (d) unstalls and executes

    -
  • -
  • -

    (e) drains from the store buffer to memory

    -
  • -
-
-
-

This corresponds to a global memory order of (f), (i), (a), (c), (d), -(e). Note that even though (f) performs before (d), the value returned -by (f) is newer than the value returned by (d). Therefore, this -execution is legal and does not violate the CoRR requirements.

-
-
-

Likewise, if two back-to-back loads return the values written by the -same store, then they may also appear out-of-order in the global memory -order without violating CoRR. Note that this is not the same as saying -that the two loads return the same value, since two different stores may -write the same value.

-
- - ---- - - - - - - -
Table 36. Litmus test RSW (outcome permitted)
------ - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
Hart 0 | Hart 1

li t1, 1

(d)

lw a0,0(s1)

(a)

sw t1,0(s0)

(e)

xor t2,a0,a0

(b)

fence w, w

(f)

add s4,s2,t2

(c)

sw t1,0(s1)

(g)

lw a1,0(s4)

(h)

lw a2,0(s2)

(i)

xor t3,a2,a2

(j)

add s0,s0,t3

(k)

lw a3,0(s0)

Outcome: a0=1, a1=v, a2=v, a3=0

--- - - - - - -
-
-litmus rsw -
-
-
-

Consider the litmus test of Table 36. -The outcome a0=1, a1=v, a2=v, a3=0 (where v is -some value written by another hart) can be observed by allowing (g) and -(h) to be reordered. This might be done speculatively, and the -speculation can be justified by the microarchitecture (e.g., by snooping -for cache invalidations and finding none) because replaying (h) after -(g) would return the value written by the same store anyway. Hence -assuming a1 and a2 would end up with the same value written by the -same store anyway, (g) and (h) can be legally reordered. The global -memory order corresponding to this execution would be -(h),(k),(a),(c),(d),(g).

-
-
-

Executions of the test in Table 36 in -which a1 does not equal a2 do in fact require that (g) appears -before (h) in the global memory order. Allowing (h) to appear before (g) -in the global memory order would in that case result in a violation of -CoRR, because then (h) would return an older value than that returned by -(g). Therefore, rule 2 forbids this CoRR violation -from occurring. As such, rule 2 strikes a careful -balance between enforcing CoRR in all cases while simultaneously being -weak enough to permit "RSW" and "fri-rfi" patterns that commonly -appear in real microarchitectures.

-
-
-

There is one more overlapping-address rule: rule 3 simply states that a value cannot -be returned from an AMO or SC to a subsequent load until the AMO or SC -has (in the case of the SC, successfully) performed globally. This -follows somewhat naturally from the conceptual view that both AMOs and -SC instructions are meant to be performed atomically in memory. However, -notably, rule 3 states that hardware -may not even non-speculatively forward the value being stored by an -AMOSWAP to a subsequent load, even though for AMOSWAP that store value -is not actually semantically dependent on the previous value in memory, -as is the case for the other AMOs. The same holds true even when -forwarding from SC store values that are not semantically dependent on -the value returned by the paired LR.

-
-
-

The three PPO rules above also apply when the memory accesses in -question only overlap partially. This can occur, for example, when -accesses of different sizes are used to access the same object. Note -also that the base addresses of two overlapping memory operations need -not necessarily be the same for two memory accesses to overlap. When -misaligned memory accesses are being used, the overlapping-address PPO -rules apply to each of the component memory accesses independently.

-
-
-
-

A.3.6. Fences (Rule 4)

-
- - - - - -
- - -
-

Rule 4: There is a FENCE instruction that orders a before b

-
-
-
-
-

By default, the FENCE instruction ensures that all memory accesses from instructions preceding the fence in program order (the "predecessor set") appear earlier in the global memory order than memory accesses from instructions appearing after the fence in program order (the "successor set"). However, fences can optionally further restrict the predecessor set and/or the successor set to a smaller set of memory accesses in order to provide some speedup. Specifically, fences have PR, PW, SR, and SW bits which restrict the predecessor and/or successor sets. The predecessor set includes loads (resp. stores) if and only if PR (resp. PW) is set. Similarly, the successor set includes loads (resp. stores) if and only if SR (resp. SW) is set.

-
-
-

The FENCE encoding currently has nine non-trivial combinations of the -four bits PR, PW, SR, and SW, plus one extra encoding FENCE.TSO which -facilitates mapping of "acquire+release" or RVTSO semantics. The -remaining seven combinations have empty predecessor and/or successor -sets and hence are no-ops. Of the ten non-trivial options, only six are -commonly used in practice:

-
-
-
    -
  • -

    FENCE RW,RW

    -
  • -
  • -

    FENCE.TSO

    -
  • -
  • -

    FENCE RW,W

    -
  • -
  • -

    FENCE R,RW

    -
  • -
  • -

    FENCE R,R

    -
  • -
  • -

    FENCE W,W

    -
  • -
-
-
-

FENCE instructions using any other combination of PR, PW, SR, and SW are -reserved. We strongly recommend that programmers stick to these six. -Other combinations may have unknown or unexpected interactions with the -memory model.

-
-
-

Finally, we note that since RISC-V uses a multi-copy atomic memory model, programmers can reason about fence bits in a thread-local manner. There is no complex notion of "fence cumulativity" as found in memory models that are not multi-copy atomic.

-
-
-
-

A.3.7. Explicit Synchronization (Rules 5-8)

-
- - - - - -
- - -
-

Rule 5: a has an acquire annotation

-
-
-

Rule 6: b has a release annotation

-
-
-

Rule 7: a and b both have RCsc annotations

-
-
-

Rule 8: a is paired with b

-
-
-
-
-

An acquire operation, as would be used at the start of a critical -section, requires all memory operations following the acquire in program -order to also follow the acquire in the global memory order. This -ensures, for example, that all loads and stores inside the critical -section are up to date with respect to the synchronization variable -being used to protect it. Acquire ordering can be enforced in one of two -ways: with an acquire annotation, which enforces ordering with respect -to just the synchronization variable itself, or with a FENCE R,RW, which -enforces ordering with respect to all previous loads.

-
-
-
Listing 2. A spinlock with atomics
-
-
          sd           x1, (a1)     # Arbitrary unrelated store
-          ld           x2, (a2)     # Arbitrary unrelated load
-          li           t0, 1        # Initialize swap value.
-      again:
-          amoswap.w.aq t0, t0, (a0) # Attempt to acquire lock.
-          bnez         t0, again    # Retry if held.
-          # ...
-          # Critical section.
-          # ...
-          amoswap.w.rl x0, x0, (a0) # Release lock by storing 0.
-          sd           x3, (a3)     # Arbitrary unrelated store
-          ld           x4, (a4)     # Arbitrary unrelated load
-
-
-
-

Consider Listing 2. Because this example uses aq, the loads and stores in the critical section are guaranteed to appear in the global memory order after the AMOSWAP used to acquire the lock. However, assuming a0, a1, and a2 point to different memory locations, the loads and stores in the critical section may or may not appear after the "Arbitrary unrelated load" at the beginning of the example in the global memory order.

-
-
-
Listing 3. A spinlock with fences
-
-
          sd           x1, (a1)     # Arbitrary unrelated store
-          ld           x2, (a2)     # Arbitrary unrelated load
-          li           t0, 1        # Initialize swap value.
-      again:
-          amoswap.w    t0, t0, (a0) # Attempt to acquire lock.
-          fence        r, rw        # Enforce "acquire" memory ordering
-          bnez         t0, again    # Retry if held.
-          # ...
-          # Critical section.
-          # ...
-          fence        rw, w        # Enforce "release" memory ordering
-          amoswap.w    x0, x0, (a0) # Release lock by storing 0.
-          sd           x3, (a3)     # Arbitrary unrelated store
-          ld           x4, (a4)     # Arbitrary unrelated load
-
-
-
-

Now, consider the alternative in Listing 3. In this case, even though the AMOSWAP does not enforce ordering with an aq bit, the fence nevertheless enforces that the acquire AMOSWAP appears earlier in the global memory order than all loads and stores in the critical section. Note, however, that in this case, the fence also enforces additional orderings: it also requires that the "Arbitrary unrelated load" at the start of the program appears earlier in the global memory order than the loads and stores of the critical section. (This particular fence does not, however, enforce any ordering with respect to the "Arbitrary unrelated store" at the start of the snippet.) In this way, fence-enforced orderings are slightly coarser than orderings enforced by .aq.

-
-
-

Release orderings work exactly the same as acquire orderings, just in the opposite direction. Release semantics require all loads and stores preceding the release operation in program order to also precede the release operation in the global memory order. This ensures, for example, that memory accesses in a critical section appear before the lock-releasing store in the global memory order. Just as for acquire semantics, release semantics can be enforced using release annotations or with a FENCE RW,W operation. Using the same examples, the ordering between the loads and stores in the critical section and the "Arbitrary unrelated store" at the end of the code snippet is enforced only by the FENCE RW,W in Listing 3, not by the rl in Listing 2.

-
-
-

With RCpc annotations alone, store-release-to-load-acquire ordering is not enforced. This facilitates the porting of code written under the TSO and/or RCpc memory models. To enforce store-release-to-load-acquire ordering, the code must use store-release-RCsc and load-acquire-RCsc operations so that PPO rule 7 applies. RCpc alone is sufficient for many use cases in C/C++ but is insufficient for many other use cases in C/C++, Java, and Linux, to name just a few examples; see Memory Porting for details.

-
-
-

PPO rule 8 indicates that an SC must appear after -its paired LR in the global memory order. This will follow naturally -from the common use of LR/SC to perform an atomic read-modify-write -operation due to the inherent data dependency. However, PPO -rule 8 also applies even when the value being stored -does not syntactically depend on the value returned by the paired LR.

-
-
-

Lastly, we note that just as with fences, programmers need not worry -about "cumulativity" when analyzing ordering annotations.

-
-
-
-

A.3.8. Syntactic Dependencies (Rules 9-11)

-
- - - - - -
- - -
-

Rule 9: b has a syntactic address dependency on a

-
-
-

Rule 10: b has a syntactic data dependency on a

-
-
-

Rule 11: b is a store, and b has a syntactic control dependency on a

-
-
-
-
-

Dependencies from a load to a later memory operation in the same hart -are respected by the RVWMO memory model. The Alpha memory model was -notable for choosing not to enforce the ordering of such dependencies, -but most modern hardware and software memory models consider allowing -dependent instructions to be reordered too confusing and -counterintuitive. Furthermore, modern code sometimes intentionally uses -such dependencies as a particularly lightweight ordering enforcement -mechanism.

-
-
-

The terms in Section 18.1.2 work as follows. Instructions -are said to carry dependencies from their -source register(s) to their destination register(s) whenever the value -written into each destination register is a function of the source -register(s). For most instructions, this means that the destination -register(s) carry a dependency from all source register(s). However, -there are a few notable exceptions. In the case of memory instructions, -the value written into the destination register ultimately comes from -the memory system rather than from the source register(s) directly, and -so this breaks the chain of dependencies carried from the source -register(s). In the case of unconditional jumps, the value written into -the destination register comes from the current pc (which is never -considered a source register by the memory model), and so likewise, JALR -(the only jump with a source register) does not carry a dependency from -rs1 to rd.

-
-
-
Listing 4. (c) has a syntactic dependency on both (a) and (b) via fflags, a destination register that both (a) and (b) implicitly accumulate into
-
-
(a) fadd f3,f1,f2
-(b) fadd f6,f4,f5
-(c) csrrs a0,fflags,x0
-
-
-
-

The notion of accumulating into a destination register rather than -writing into it reflects the behavior of CSRs such as fflags. In -particular, an accumulation into a register does not clobber any -previous writes or accumulations into the same register. For example, in -Listing 4, (c) has a syntactic dependency on both (a) and (b).

-
-
-

Like other modern memory models, the RVWMO memory model uses syntactic -rather than semantic dependencies. In other words, this definition -depends on the identities of the registers being accessed by different -instructions, not the actual contents of those registers. This means -that an address, control, or data dependency must be enforced even if -the calculation could seemingly be optimized away. This choice -ensures that RVWMO remains compatible with code that uses these false -syntactic dependencies as a lightweight ordering mechanism.

-
-
-
Listing 5. A syntactic address dependency
-
-
ld a1,0(s0)
-xor a2,a1,a1
-add s1,s1,a2
-ld a5,0(s1)
-
-
-
-

For example, there is a syntactic address dependency from the memory -operation generated by the first instruction to the memory operation -generated by the last instruction in -Listing 5, even though a1 XOR -a1 is zero and hence has no effect on the address accessed by the -second load.

-
-
-

The benefit of using dependencies as a lightweight synchronization -mechanism is that the ordering enforcement requirement is limited only -to the specific two instructions in question. Other non-dependent -instructions may be freely reordered by aggressive implementations. One -alternative would be to use a load-acquire, but this would enforce -ordering for the first load with respect to all subsequent -instructions. Another would be to use a FENCE R,R, but this would -include all previous and all subsequent loads, making this option more -expensive.

-
-
-
Listing 6. A syntactic control dependency
-
-
lw x1,0(x2)
-bne x1,x0,next
-sw x3,0(x4)
-next: sw x5,0(x6)
-
-
-
-

Control dependencies behave differently from address and data dependencies in the sense that a control dependency always extends to all instructions following the original target in program order. Consider Listing 6: the instruction at next will always execute, but the memory operation generated by that last instruction nevertheless still has a control dependency from the memory operation generated by the first instruction.

-
-
-
Listing 7. Another syntactic control dependency
-
-
lw x1,0(x2)
-bne x1,x0,next
-next: sw x3,0(x4)
-
-
-
-

Likewise, consider Listing 7. -Even though both branch outcomes have the same target, there is still a -control dependency from the memory operation generated by the first -instruction in this snippet to the memory operation generated by the -last instruction. This definition of control dependency is subtly -stronger than what might be seen in other contexts (e.g., C++), but it -conforms with standard definitions of control dependencies in the -literature.

-
-
-

Notably, PPO rules 9-11 are also intentionally designed to respect dependencies that originate from the output of a successful store-conditional instruction. Typically, an SC instruction will be followed by a conditional branch checking whether the outcome was successful; this implies that there will be a control dependency from the store operation generated by the SC instruction to any memory operations following the branch. PPO rule 11 in turn implies that any subsequent store operations will appear later in the global memory order than the store operation generated by the SC. However, since control, address, and data dependencies are defined over memory operations, and since an unsuccessful SC does not generate a memory operation, no order is enforced between an unsuccessful SC and its dependent instructions. Moreover, since SC is defined to carry dependencies from its source registers to rd only when the SC is successful, an unsuccessful SC has no effect on the global memory order.

-
- - ---- - - - - - - -
Table 37. A variant of the LB litmus test (outcome forbidden)
------ - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -

Initial values: 0(s0)=1; 0(s2)=1

Hart 0

Hart 1

(a)

ld a0,0(s0)

(e)

ld a3,0(s2)

(b)

lr a1,0(s1)

(f)

sd a3,0(s0)

(c)

sc a2,a0,0(s1)

(d)

sd a2,0(s2)

Outcome: a0=0, a3=0

--- - - - - - -
-
-litmus lb lrsc -
-
-
-

In addition, the choice to respect dependencies originating at -store-conditional instructions ensures that certain out-of-thin-air-like -behaviors will be prevented. Consider -Table 37. Suppose a -hypothetical implementation could occasionally make some early guarantee -that a store-conditional operation will succeed. In this case, (c) could -return 0 to a2 early (before actually executing), allowing the -sequence (d), (e), (f), (a), and then (b) to execute, and then (c) might -execute (successfully) only at that point. This would imply that (c) -writes its own success value to 0(s1)! Fortunately, this situation and -others like it are prevented by the fact that RVWMO respects -dependencies originating at the stores generated by successful SC -instructions.

-
-
-

We also note that syntactic dependencies between instructions only have -any force when they take the form of a syntactic address, control, -and/or data dependency. For example: a syntactic dependency between two -F instructions via one of the accumulating CSRs in -Section 18.3 does not imply -that the two F instructions must be executed in order. Such a -dependency would only serve to ultimately set up later a dependency from -both F instructions to a later CSR instruction accessing the CSR -flag in question.

-
-
-
-

A.3.9. Pipeline Dependencies (Rules 12-13)

-
- - - - - -
- - -
-

Rule 12: b is a load, and there exists some store m between a and b in -program order such that m has an address or data dependency on a, -and b returns a value written by m

-
-
-

Rule 13: b is a store, and there exists some instruction m between a and -b in program order such that m has an address dependency on a

-
-
-
- - ---- - - - - - - -
Table 38. Because of PPO rule 12 and the data dependency from (d) to (e), (d) must also precede (f) in the global memory order (outcome forbidden)
------ - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
Hart 0Hart 1

li t1, 1

(d)

lw a0, 0(s1)

(a)

sw t1,0(s0)

(e)

sw a0, 0(s2)

(b)

fence w, w

(f)

lw a1, 0(s2)

(c)

sw t1,0(s1)

xor a2,a1,a1

add s0,s0,a2

(g)

lw a3,0(s0)

Outcome: a0=1, a3=0

--- - - - - - -
-
-litmus datarfi -
-
-
-

PPO rules 12 and 13 reflect behaviors of almost all real processor pipeline implementations. Rule 12 states that a load cannot forward from a store until the address and data for that store are known. Consider Table 38: (f) cannot be executed until the data for (e) has been resolved, because (f) must return the value written by (e) (or by something even later in the global memory order), and the old value must not be clobbered by the writeback of (e) before (d) has had a chance to perform. Therefore, (f) will never perform before (d) has performed.

-
- - ---- - - - - - - -
Table 39. Because of the extra store between (e) and (g), (d) no longer necessarily precedes (g) (outcome permitted)
------ - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
Hart 0Hart 1

li t1, 1

li t1, 1

(a)

sw t1,0(s0)

(d)

lw a0, 0(s1)

(b)

fence w, w

(e)

sw a0, 0(s2)

(c)

sw t1,0(s1)

(f)

sw t1, 0(s2)

(g)

lw a1, 0(s2)

xor a2,a1,a1

add s0,s0,a2

(h)

lw a3,0(s0)

Outcome: a0=1, a3=0

--- - - - - - -
-
-litmus datacoirfi -
-
-
-

If there were another store to the same address in between (e) and (f), as in Table 39, then (f) would no longer be dependent on the data of (e) being resolved, and hence the dependency of (f) on (d), which produces the data for (e), would be broken.

-
-
-

Rule 13 makes a similar observation to the previous rule: a store cannot be performed at memory until all previous loads that might access the same address have themselves been performed. Such a load must appear to execute before the store, but it cannot do so if the store were to overwrite the value in memory before the load had a chance to read the old value. Likewise, a store generally cannot be performed until it is known that preceding instructions will not cause an exception due to failed address resolution, and in this sense, rule 13 can be seen as somewhat of a special case of rule 11.

-
- - ---- - - - - - - -
Table 40. Because of the address dependency from (d) to (e), (d) also precedes (f) (outcome forbidden)
------ - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
Hart 0Hart 1

li t1, 1

(a)

lw a0,0(s0)

(d)

lw a1, 0(s1)

(b)

fence rw,rw

(e)

lw a2, 0(a1)

(c)

sw s2,0(s1)

(f)

sw t1, 0(s0)

Outcome: a0=1, a1=t

--- - - - - - -
-

litmus addrpo

-
-
-

Consider Table 40: (f) cannot be executed until the address for (e) is resolved, because it may turn out that the addresses match; i.e., that a1=s0. Therefore, (f) cannot be sent to memory before (d) has executed and confirmed whether the addresses do indeed overlap.

-
-
-
-
-

A.4. Beyond Main Memory

-
-

RVWMO does not currently attempt to formally describe how FENCE.I, -SFENCE.VMA, I/O fences, and PMAs behave. All of these behaviors will be -described by future formalizations. In the meantime, the behavior of -FENCE.I is described in Chapter 6, the -behavior of SFENCE.VMA is described in the RISC-V Instruction Set -Privileged Architecture Manual, and the behavior of I/O fences and the -effects of PMAs are described below.

-
-
-

A.4.1. Coherence and Cacheability

-
-

The RISC-V Privileged ISA defines Physical Memory Attributes (PMAs) -which specify, among other things, whether portions of the address space -are coherent and/or cacheable. See the RISC-V Privileged ISA -Specification for the complete details. Here, we simply discuss how the -various details in each PMA relate to the memory model:

-
-
-
    -
  • -

    Main memory vs. I/O, and I/O memory ordering PMAs: the memory model as defined applies to main memory regions. I/O ordering is discussed below.

    -
  • -
  • -

    Supported access types and atomicity PMAs: the memory model is simply -applied on top of whatever primitives each region supports.

    -
  • -
  • -

    Cacheability PMAs: the cacheability PMAs in general do not affect the -memory model. Non-cacheable regions may have more restrictive behavior -than cacheable regions, but the set of allowed behaviors does not change -regardless. However, some platform-specific and/or device-specific -cacheability settings may differ.

    -
  • -
  • -

    Coherence PMAs: The memory consistency model for memory regions marked -as non-coherent in PMAs is currently platform-specific and/or -device-specific: the load-value axiom, the atomicity axiom, and the -progress axiom all may be violated with non-coherent memory. Note -however that coherent memory does not require a hardware cache coherence -protocol. The RISC-V Privileged ISA Specification suggests that -hardware-incoherent regions of main memory are discouraged, but the -memory model is compatible with hardware coherence, software coherence, -implicit coherence due to read-only memory, implicit coherence due to -only one agent having access, or otherwise.

    -
  • -
  • -

    Idempotency PMAs: Idempotency PMAs are used to specify memory regions -for which loads and/or stores may have side effects, and this in turn is -used by the microarchitecture to determine, e.g., whether prefetches are -legal. This distinction does not affect the memory model.

    -
  • -
-
-
-
-

A.4.2. I/O Ordering

-
-

For I/O, the load value axiom and atomicity axiom in general do not -apply, as both reads and writes might have device-specific side effects -and may return values other than the value "written" by the most -recent store to the same address. Nevertheless, the following preserved -program order rules still generally apply for accesses to I/O memory: -memory access a precedes memory access b in -global memory order if a precedes b in -program order and one or more of the following holds:

-
-
-
    -
  1. -

    a precedes b in preserved program order as -defined in Chapter 18, with the exception -that acquire and release ordering annotations apply only from one memory -operation to another memory operation and from one I/O operation to -another I/O operation, but not from a memory operation to an I/O nor -vice versa

    -
  2. -
  3. -

    a and b are accesses to overlapping -addresses in an I/O region

    -
  4. -
  5. -

    a and b are accesses to the same strongly -ordered I/O region

    -
  6. -
  7. -

    a and b are accesses to I/O regions, and -the channel associated with the I/O region accessed by either -a or b is channel 1

    -
  8. -
  9. -

    a and b are accesses to I/O regions -associated with the same channel (except for channel 0)

    -
  10. -
-
-
-

Note that the FENCE instruction distinguishes between main memory -operations and I/O operations in its predecessor and successor sets. To -enforce ordering between I/O operations and main memory operations, code -must use a FENCE with PI, PO, SI, and/or SO, plus PR, PW, SR, and/or SW. -For example, to enforce ordering between a write to main memory and an -I/O write to a device register, a FENCE W,O or stronger is needed.

-
-
-
Listing 8. Ordering memory and I/O accesses
-
-
sd t0, 0(a0)
-fence w,o
-sd a0, 0(a1)
-
-
-
-

When a fence is in fact used, implementations must assume that the -device may attempt to access memory immediately after receiving the MMIO -signal, and subsequent memory accesses from that device to memory must -observe the effects of all accesses ordered prior to that MMIO -operation. In other words, in Listing 8, -suppose 0(a0) is in main memory and 0(a1) is the address of a device -register in I/O memory. If the device accesses 0(a0) upon receiving -the MMIO write, then that load must conceptually appear after the first -store to 0(a0) according to the rules of the RVWMO memory model. In -some implementations, the only way to ensure this will be to require -that the first store does in fact complete before the MMIO write is -issued. Other implementations may find ways to be more aggressive, while -others still may not need to do anything different at all for I/O and -main memory accesses. Nevertheless, the RVWMO memory model does not -distinguish between these options; it simply provides an -implementation-agnostic mechanism to specify the orderings that must be -enforced.

-
-
-

Many architectures include separate notions of "ordering" and "completion" fences, especially as it relates to I/O (as opposed to regular main memory). Ordering fences simply ensure that memory operations stay in order, while completion fences ensure that predecessor accesses have all completed before any successors are made visible. RISC-V does not explicitly distinguish between ordering and completion fences. Instead, this distinction is simply inferred from different uses of the FENCE bits.

-
-
-

For implementations that conform to the RISC-V Unix Platform -Specification, I/O devices and DMA operations are required to access -memory coherently and via strongly ordered I/O channels. Therefore, -accesses to regular main memory regions that are concurrently accessed -by external devices can also use the standard synchronization -mechanisms. Implementations that do not conform to the Unix Platform -Specification and/or in which devices do not access memory coherently -will need to use mechanisms (which are currently platform-specific or -device-specific) to enforce coherency.

-
-
-

I/O regions in the address space should be considered non-cacheable -regions in the PMAs for those regions. Such regions can be considered -coherent by the PMA if they are not cached by any agent.

-
-
-

The ordering guarantees in this section may not apply beyond a -platform-specific boundary between the RISC-V cores and the device. In -particular, I/O accesses sent across an external bus (e.g., PCIe) may be -reordered before they reach their ultimate destination. Ordering must be -enforced in such situations according to the platform-specific rules of -those external devices and buses.

-
-
-
-
-

A.5. Code Porting and Mapping Guidelines

- - ---- - - - - - - - - - - - - - - - - - - - - - - - - -
Table 41. Mappings from TSO operations to RISC-V operations
x86/TSO OperationRVWMO Mapping

Load

l{b|h|w|d}; fence r,rw

Store

fence rw,w; s{b|h|w|d}

Atomic RMW

amo<op>.{w|d}.aqrl OR
-loop:lr.{w|d}.aq; <op>; sc.{w|d}.aqrl; bnez loop

Fence

fence rw,rw

-
-

Table 41 provides a mapping from TSO memory -operations onto RISC-V memory instructions. Normal x86 loads and stores -are all inherently acquire-RCpc and release-RCpc operations: TSO -enforces all load-load, load-store, and store-store ordering by default. -Therefore, under RVWMO, all TSO loads must be mapped onto a load -followed by FENCE R,RW, and all TSO stores must be mapped onto -FENCE RW,W followed by a store. TSO atomic read-modify-writes and x86 -instructions using the LOCK prefix are fully ordered and can be -implemented either via an AMO with both aq and rl set, or via an LR -with aq set, the arithmetic operation in question, an SC with both -aq and rl set, and a conditional branch checking the success -condition. In the latter case, the rl annotation on the LR turns out -(for non-obvious reasons) to be redundant and can be omitted.

-
-
-

Alternatives to Table 41 are also possible. A TSO -store can be mapped onto AMOSWAP with rl set. However, since RVWMO PPO -Rule 3 forbids forwarding of values from -AMOs to subsequent loads, the use of AMOSWAP for stores may negatively -affect performance. A TSO load can be mapped using LR with aq set: all -such LR instructions will be unpaired, but that fact in and of itself -does not preclude the use of LR for loads. However, again, this mapping -may also negatively affect performance if it puts more pressure on the -reservation mechanism than was originally intended.

-
- - ---- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
Table 42. Mappings from Power operations to RISC-V operations
Power OperationRVWMO Mapping

Load

l{b|h|w|d}

Load-Reserve

lr.{w|d}

Store

s{b|h|w|d}

Store-Conditional

sc.{w|d}

lwsync

fence.tso

sync

fence rw,rw

isync

fence.i; fence r,r

-
-

Table 42 provides a mapping from Power memory -operations onto RISC-V memory instructions. Power ISYNC maps on RISC-V -to a FENCE.I followed by a FENCE R,R; the latter fence is needed because -ISYNC is used to define a "control+control fence" dependency that is -not present in RVWMO.

-
- - ---- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
Table 43. Mappings from ARM operations to RISC-V operations
ARM OperationRVWMO Mapping

Load

l{b|h|w|d}

Load-Acquire

fence rw, rw; l{b|h|w|d}; fence r,rw

Load-Exclusive

lr.{w|d}

Load-Acquire-Exclusive

lr.{w|d}.aqrl

Store

s{b|h|w|d}

Store-Release

fence rw,w; s{b|h|w|d}

Store-Exclusive

sc.{w|d}

Store-Release-Exclusive

sc.{w|d}.rl

dmb

fence rw,rw

dmb.ld

fence r,rw

dmb.st

fence w,w

isb

fence.i; fence r,r

-
-

Table 43 provides a mapping from ARM memory -operations onto RISC-V memory instructions. Since RISC-V does not -currently have plain load and store opcodes with aq or rl -annotations, ARM load-acquire and store-release operations should be -mapped using fences instead. Furthermore, in order to enforce -store-release-to-load-acquire ordering, there must be a FENCE RW,RW -between the store-release and load-acquire; Table 43 -enforces this by always placing the fence in front of each acquire -operation. ARM load-exclusive and store-exclusive instructions can -likewise map onto their RISC-V LR and SC equivalents, but instead of -placing a FENCE RW,RW in front of an LR with aq set, we simply also -set rl instead. ARM ISB maps on RISC-V to FENCE.I followed by -FENCE R,R similarly to how ISYNC maps for Power.

-
- - ---- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
Table 44. Mappings from Linux memory primitives to RISC-V primitives.
Linux OperationRVWMO Mapping

smp_mb()

fence rw,rw

smp_rmb()

fence r,r

smp_wmb()

fence w,w

dma_rmb()

fence r,r

dma_wmb()

fence w,w

mb()

fence iorw,iorw

rmb()

fence ri,ri

wmb()

fence wo,wo

smp_load_acquire()

l{b|h|w|d}; fence r,rw

smp_store_release()

fence.tso; s{b|h|w|d}

Linux Construct

RVWMO AMO Mapping

atomic <op> relaxed

amo <op>.{w|d}

atomic <op> acquire

amo <op>.{w|d}.aq

atomic <op> release

amo <op>.{w|d}.rl

atomic <op>

amo <op>.{w|d}.aqrl

Linux Construct

RVWMO LR/SC Mapping

atomic <op> relaxed

loop:lr.{w|d}; <op>; sc.{w|d}; bnez loop

atomic <op> acquire

loop:lr.{w|d}.aq; <op>; sc.{w|d}; bnez loop

atomic <op> release

loop:lr.{w|d}; <op>; sc.{w|d}.aqrl^*; bnez loop OR

fence.tso; loop:lr.{w|d}; <op>; sc.{w|d}^*; bnez loop

atomic <op>

loop:lr.{w|d}.aq; <op>; sc.{w|d}.aqrl; bnez loop

-
-

With regard to Table 44, other constructs (such as spinlocks) should follow accordingly. Platforms or devices with non-coherent DMA may need additional synchronization (such as cache flush or invalidate mechanisms); currently any such extra synchronization will be device-specific.

-
-
-

Table 44 provides a mapping of Linux memory -ordering macros onto RISC-V memory instructions. The Linux fences -dma_rmb() and dma_wmb() map onto FENCE R,R and FENCE W,W, -respectively, since the RISC-V Unix Platform requires coherent DMA, but -would be mapped onto FENCE RI,RI and FENCE WO,WO, respectively, on a -platform with non-coherent DMA. Platforms with non-coherent DMA may also -require a mechanism by which cache lines can be flushed and/or -invalidated. Such mechanisms will be device-specific and/or standardized -in a future extension to the ISA.

-
-
-

The Linux mappings for release operations may seem stronger than -necessary, but these mappings are needed to cover some cases in which -Linux requires stronger orderings than the more intuitive mappings would -provide. In particular, as of the time this text is being written, Linux -is actively debating whether to require load-load, load-store, and -store-store orderings between accesses in one critical section and -accesses in a subsequent critical section in the same hart and protected -by the same synchronization object. Not all combinations of -FENCE RW,W/FENCE R,RW mappings with aq/rl mappings combine to -provide such orderings. There are a few ways around this problem, -including:

-
-
-
    -
  1. -

    Always use FENCE RW,W/FENCE R,RW, and never use aq/rl. This -suffices but is undesirable, as it defeats the purpose of the aq/rl -modifiers.

    -
  2. -
  3. -

    Always use aq/rl, and never use FENCE RW,W/FENCE R,RW. This does -not currently work due to the lack of load and store opcodes with aq -and rl modifiers.

    -
  4. -
  5. -

    Strengthen the mappings of release operations such that they would -enforce sufficient orderings in the presence of either type of acquire -mapping. This is the currently recommended solution, and the one shown -in Table 44.

    -
  6. -
-
-
-

RVWMO Mapping:
(a) lw a0, 0(s0)
(b) fence.tso // vs. fence rw,w
(c) sd x0,0(s1)
...
loop:
(d) amoswap.d.aq a1,t1,0(s1)
bnez a1,loop
(e) lw a2,0(s2)

-
-
-

For example, the critical section ordering rule currently being debated -by the Linux community would require (a) to be ordered before (e) in -Listing 9. If that will indeed be -required, then it would be insufficient for (b) to map as FENCE RW,W. -That said, these mappings are subject to change as the Linux Kernel -Memory Model evolves.

-
-
-
Listing 9. Orderings between critical sections in Linux
-
-
Linux Code:
-(a) int r0 = *x;
-       (bc) spin_unlock(y, 0);
-....
-....
-(d) spin_lock(y);
-(e) int r1 = *z;
-
-RVWMO Mapping:
-(a) lw a0, 0(s0)
-(b) fence.tso // vs. fence rw,w
-(c) sd x0,0(s1)
-....
-loop:
-(d) lr.d.aq a1,(s1)
-bnez a1,loop
-sc.d a1,t1,(s1)
-bnez a1,loop
-(e) lw a2,0(s2)
-
-
-
-

Table 45 provides a mapping of C11/C++11 atomic -operations onto RISC-V memory instructions. If load and store opcodes -with aq and rl modifiers are introduced, then the mappings in -Table 46 will suffice. Note however that -the two mappings only interoperate correctly if -atomic_<op>(memory_order_seq_cst) is mapped using an LR that has both -aq and rl set. -Even more importantly, a Table 45 sequentially consistent store, -followed by a Table 46 sequentially consistent load -can be reordered unless the Table 45 mapping of stores is -strengthened by either adding a second fence or mapping the store -to amoswap.rl instead.

-
- - ---- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
Table 45. Mappings from C/C++ primitives to RISC-V primitives.
C/C++ ConstructRVWMO Mapping

Non-atomic load

l{b|h|w|d}

atomic_load(memory_order_relaxed)

l{b|h|w|d}

atomic_load(memory_order_acquire)

l{b|h|w|d}; fence r,rw

atomic_load(memory_order_seq_cst)

fence rw,rw; l{b|h|w|d}; fence r,rw

Non-atomic store

s{b|h|w|d}

atomic_store(memory_order_relaxed)

s{b|h|w|d}

atomic_store(memory_order_release)

fence rw,w; s{b|h|w|d}

atomic_store(memory_order_seq_cst)

fence rw,w; s{b|h|w|d}

atomic_thread_fence(memory_order_acquire)

fence r,rw

atomic_thread_fence(memory_order_release)

fence rw,w

atomic_thread_fence(memory_order_acq_rel)

fence.tso

atomic_thread_fence(memory_order_seq_cst)

fence rw,rw

C/C++ Construct

RVWMO AMO Mapping

atomic_<op>(memory_order_relaxed)

amo<op>.{w|d}

atomic_<op>(memory_order_acquire)

amo<op>.{w|d}.aq

atomic_<op>(memory_order_release)

amo<op>.{w|d}.rl

atomic_<op>(memory_order_acq_rel)

amo<op>.{w|d}.aqrl

atomic_<op>(memory_order_seq_cst)

amo<op>.{w|d}.aqrl

C/C++ Construct

RVWMO LR/SC Mapping

atomic_<op>(memory_order_relaxed)

loop:lr.{w|d}; <op>; sc.{w|d};

bnez loop

atomic_<op>(memory_order_acquire)

loop:lr.{w|d}.aq; <op>; sc.{w|d};

bnez loop

atomic_<op>(memory_order_release)

loop:lr.{w|d}; <op>; sc.{w|d}.rl;

bnez loop

atomic_<op>(memory_order_acq_rel)

loop:lr.{w|d}.aq; <op>; sc.{w|d}.rl;

bnez loop

atomic_<op>(memory_order_seq_cst)

loop:lr.{w|d}.aqrl; <op>;

sc.{w|d}.rl; bnez loop

- - ---- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
Table 46. Hypothetical mappings from C/C++ primitives to RISC-V primitives, if native load-acquire and store-release opcodes are introduced.
C/C++ ConstructRVWMO Mapping

Non-atomic load

l{b|h|w|d}

atomic_load(memory_order_relaxed)

l{b|h|w|d}

atomic_load(memory_order_acquire)

l{b|h|w|d}.aq

atomic_load(memory_order_seq_cst)

l{b|h|w|d}.aq

Non-atomic store

s{b|h|w|d}

atomic_store(memory_order_relaxed)

s{b|h|w|d}

atomic_store(memory_order_release)

s{b|h|w|d}.rl

atomic_store(memory_order_seq_cst)

s{b|h|w|d}.rl

atomic_thread_fence(memory_order_acquire)

fence r,rw

atomic_thread_fence(memory_order_release)

fence rw,w

atomic_thread_fence(memory_order_acq_rel)

fence.tso

atomic_thread_fence(memory_order_seq_cst)

fence rw,rw

C/C++ Construct

RVWMO AMO Mapping

atomic_<op>(memory_order_relaxed)

amo<op>.{w|d}

atomic_<op>(memory_order_acquire)

amo<op>.{w|d}.aq

atomic_<op>(memory_order_release)

amo<op>.{w|d}.rl

atomic_<op>(memory_order_acq_rel)

amo<op>.{w|d}.aqrl

atomic_<op>(memory_order_seq_cst)

amo<op>.{w|d}.aqrl

C/C++ Construct

RVWMO LR/SC Mapping

atomic_<op>(memory_order_relaxed)

lr.{w|d}; <op>; sc.{w|d}

atomic_<op>(memory_order_acquire)

lr.{w|d}.aq; <op>; sc.{w|d}

atomic_<op>(memory_order_release)

lr.{w|d}; <op>; sc.{w|d}.rl

atomic_<op>(memory_order_acq_rel)

lr.{w|d}.aq; <op>; sc.{w|d}.rl

atomic_<op>(memory_order_seq_cst)

lr.{w|d}.aq*; <op>; sc.{w|d}.rl

* must be lr.{w|d}.aqrl in order to interoperate with code mapped per Table 45

-
-

Any AMO can be emulated by an LR/SC pair, but care must be taken to ensure that any PPO orderings that originate from the LR are also made to originate from the SC, and that any PPO orderings that terminate at the SC are also made to terminate at the LR. For example, the LR must also be made to respect any data dependencies that the AMO has, given that load operations do not otherwise have any notion of a data dependency. Likewise, the effect of a FENCE R,R elsewhere in the same hart must also be made to apply to the SC, which would not otherwise respect that fence. The emulator may achieve this effect by simply mapping AMOs onto lr.aq; <op>; sc.aqrl, matching the mapping used elsewhere for fully ordered atomics.

-
-
-

These C11/C++11 mappings require the platform to provide the following -Physical Memory Attributes (as defined in the RISC-V Privileged ISA) for -all memory:

-
-
-
    -
  • -

    main memory

    -
  • -
  • -

    coherent

    -
  • -
  • -

    AMOArithmetic

    -
  • -
  • -

    RsrvEventual

    -
  • -
-
-
-

Platforms with different attributes may require different mappings, or -require platform-specific SW (e.g., memory-mapped I/O).

-
-
-
-

A.6. Implementation Guidelines

-
-

The RVWMO and RVTSO memory models by no means preclude -microarchitectures from employing sophisticated speculation techniques -or other forms of optimization in order to deliver higher performance. -The models also do not impose any requirement to use any one particular -cache hierarchy, nor even to use a cache coherence protocol at all. -Instead, these models only specify the behaviors that can be exposed to -software. Microarchitectures are free to use any pipeline design, any -coherent or non-coherent cache hierarchy, any on-chip interconnect, -etc., as long as the design only admits executions that satisfy the -memory model rules. That said, to help people understand the actual -implementations of the memory model, in this section we provide some -guidelines on how architects and programmers should interpret the -models' rules.

-
-
-

Both RVWMO and RVTSO are multi-copy atomic (or -other-multi-copy-atomic): any store value that is visible to a hart -other than the one that originally issued it must also be conceptually -visible to all other harts in the system. In other words, harts may -forward from their own previous stores before those stores have become -globally visible to all harts, but no early inter-hart forwarding is -permitted. Multi-copy atomicity may be enforced in a number of ways. It -might hold inherently due to the physical design of the caches and store -buffers, it may be enforced via a single-writer/multiple-reader cache -coherence protocol, or it might hold due to some other mechanism.

-
-
-

Although multi-copy atomicity does impose some restrictions on the -microarchitecture, it is one of the key properties keeping the memory -model from becoming extremely complicated. For example, a hart may not -legally forward a value from a neighbor hart’s private store buffer -(unless of course it is done in such a way that no new illegal behaviors -become architecturally visible). Nor may a cache coherence protocol -forward a value from one hart to another until the coherence protocol -has invalidated all older copies from other caches. Of course, -microarchitectures may (and high-performance implementations likely -will) violate these rules under the covers through speculation or other -optimizations, as long as any non-compliant behaviors are not exposed to -the programmer.

-
-
-

As a rough guideline for interpreting the PPO rules in RVWMO, we expect -the following from the software perspective:

-
-
-
    -
  • -

    programmers will use PPO rules 1 and 4-8 regularly and actively.

    -
  • -
  • -

    expert programmers will use PPO rules 9-11 to speed up critical paths -of important data structures.

    -
  • -
  • -

    even expert programmers will rarely if ever use PPO rules 2-3 and -12-13 directly. -These are included to facilitate common microarchitectural optimizations -(rule 2) and the operational formal modeling approach (rules 3 and -12-13) described -in Section B.3. They also facilitate the -process of porting code from other architectures that have similar -rules.

    -
  • -
-
-
-

We also expect the following from the hardware perspective:

-
-
-
    -
  • -

    PPO rules 1 and 3-6 reflect -well-understood rules that should pose few surprises to architects.

    -
  • -
  • -

    PPO rule 2 reflects a natural and common hardware -optimization, but one that is very subtle and hence is worth double -checking carefully.

    -
  • -
  • -

    PPO rule 7 may not be immediately obvious to architects, but it is a standard memory model requirement.

    -
  • -
  • -

    The load value axiom, the atomicity axiom, and PPO rules -8-13 reflect rules that most -hardware implementations will enforce naturally, unless they contain -extreme optimizations. Of course, implementations should make sure to -double check these rules nevertheless. Hardware must also ensure that -syntactic dependencies are not optimized away.

    -
  • -
-
-
-

Architectures are free to implement any of the memory model rules as -conservatively as they choose. For example, a hardware implementation -may choose to do any or all of the following:

-
-
-
    -
  • -

    interpret all fences as if they were FENCE RW,RW (or FENCE IORW,IORW, -if I/O is involved), regardless of the bits actually set

    -
  • -
  • -

    implement all fences with PW and SR as if they were FENCE RW,RW (or -FENCE IORW,IORW, if I/O is involved), as PW with SR is the most -expensive of the four possible main memory ordering components anyway

    -
  • -
  • -

    emulate aq and rl as described in Section A.5

    -
  • -
  • -

    enforce all same-address load-load ordering, even in the presence of patterns such as fri-rfi and RSW

    -
  • -
  • -

    forbid any forwarding of a value from a store in the store buffer to a -subsequent AMO or LR to the same address

    -
  • -
  • -

    forbid any forwarding of a value from an AMO or SC in the store buffer -to a subsequent load to the same address

    -
  • -
  • -

    implement TSO on all memory accesses, and ignore any main memory -fences that do not include PW and SR ordering (e.g., as Ztso -implementations will do)

    -
  • -
  • -

    implement all atomics to be RCsc or even fully ordered, regardless of -annotation

    -
  • -
-
-
-

Architectures that implement RVTSO can safely do the following:

-
-
-
    -
  • -

    Ignore all fences that do not have both PW and SR (unless the fence -also orders I/O)

    -
  • -
  • -

    Ignore all PPO rules except for rules 4 through 7, since the rest -are redundant with other PPO rules under RVTSO assumptions

    -
  • -
-
-
-

Other general notes:

-
-
-
    -
  • -

    Silent stores (i.e., stores that write the same value that already exists at a memory location) behave like any other store from a memory model point of view. Likewise, AMOs which do not actually change the value in memory (e.g., an AMOMAX for which the value in rs2 is smaller than the value currently in memory) are still semantically considered store operations. Microarchitectures that attempt to implement silent stores must take care to ensure that the memory model is still obeyed, particularly in cases such as RSW (Section A.3.5), which tend to be incompatible with silent stores.

    -
  • -
  • -

    Writes may be merged (i.e., two consecutive writes to the same address -may be merged) or subsumed (i.e., the earlier of two back-to-back writes -to the same address may be elided) as long as the resulting behavior -does not otherwise violate the memory model semantics.

    -
  • -
-
-
-

The question of write subsumption can be understood from the following -example:

-
- - ---- - - - - - - -
Table 47. Write subsumption litmus test, allowed execution
------ - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
Hart 0Hart 1

li t1, 3

li t3, 2

li t2, 1

(a)

sw t1,0(s0)

(d)

lw a0,0(s1)

(b)

fence w, w

(e)

sw a0,0(s0)

(c)

sw t2,0(s1)

(f)

sw t3,0(s0)

--- - - - - - -
-
-litmus subsumption -
-
-
-

As written, if the load (d) reads value 1, then (a) must -precede (f) in the global memory order:

-
-
-
    -
  • -

    (a) precedes (c) in the global memory order because of rule 4

    -
  • -
  • -

    (c) precedes (d) in the global memory order because of the Load -Value axiom

    -
  • -
  • -

    (d) precedes (e) in the global memory order because of rule 10

    -
  • -
  • -

    (e) precedes (f) in the global memory order because of rule 1

    -
  • -
-
-
-

In other words, the final value of the memory location whose address is in s0 must be 2 (the value written by the store (f)) and cannot be 3 (the value written by the store (a)).

-
-
-

A very aggressive microarchitecture might erroneously decide to discard -(e), as (f) supersedes it, and this may in turn lead the -microarchitecture to break the now-eliminated dependency between (d) and -(f) (and hence also between (a) and (f)). This would violate the memory -model rules, and hence it is forbidden. Write subsumption may in other -cases be legal, if for example there were no data dependency between (d) -and (e).

-
-
-

A.6.1. Possible Future Extensions

-
-

We expect that any or all of the following possible future extensions -would be compatible with the RVWMO memory model:

-
-
-
    -
  • -

    "V" vector ISA extensions

    -
  • -
  • -

    "J" JIT extension

    -
  • -
  • -

    Native encodings for load and store opcodes with aq and rl set

    -
  • -
  • -

    Fences limited to certain addresses

    -
  • -
  • -

    Cache writeback/flush/invalidate/etc. instructions

    -
  • -
-
-
-
-
-

A.7. Known Issues

-
-

A.7.1. Mixed-size RSW

- - ------ - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
Table 48. Mixed-size discrepancy (permitted by axiomatic models, forbidden by operational model)
Hart 0Hart 1

li t1, 1

li t1, 1

(a)

lw a0,0(s0)

(d)

lw a1,0(s1)

(b)

fence rw,rw

(e)

amoswap.w.rl a2,t1,0(s2)

(c)

sw t1,0(s1)

(f)

ld a3,0(s2)

(g)

lw a4,4(s2)

xor a5,a4,a4

add s0,s0,a5

(h)

sw t1,0(s0)

Outcome: a0=1, a1=1, a2=0, a3=1, a4=0

- - ------ - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
Table 49. Mixed-size discrepancy (permitted by axiomatic models, forbidden by operational model)
Hart 0Hart 1

li t1, 1

li t1, 1

(a)

lw a0,0(s0)

(d)

ld a1,0(s1)

(b)

fence rw,rw

(e)

lw a2,4(s1)

(c)

sw t1,0(s1)

xor a3,a2,a2

add s0,s0,a3

(f)

sw t1,0(s0)

Outcome: a0=1, a1=1, a2=0

- - ------ - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
Table 50. Mixed-size discrepancy (permitted by axiomatic models, forbidden by operational model)
Hart 0Hart 1

li t1, 1

li t1, 1

(a)

lw a0,0(s0)

(d)

sw t1,4(s1)

(b)

fence rw,rw

(e)

ld a1,0(s1)

(c)

sw t1,0(s1)

(f)

lw a2,4(s1)

xor a3,a2,a2

add s0,s0,a3

(g)

sw t1,0(s0)

Outcome: a0=1, a1=0x100000001, a2=1

-
-

There is a known discrepancy between the operational and axiomatic -specifications within the family of mixed-size RSW variants shown in -Table 48-Table 50. -To address this, we may choose to add something like the following new -PPO rule: Memory operation a precedes memory operation -b in preserved program order (and hence also in the global -memory order) if a precedes b in program -order, a and b both access regular main -memory (rather than I/O regions), a is a load, -b is a store, there is a load m between -a and b, there is a byte x -that both a and m read, there is no store -between a and m that writes to -x, and m precedes b in PPO. In -other words, in herd syntax, we may choose to add -(po-loc & rsw);ppo;[W] to PPO. Many implementations will already -enforce this ordering naturally. As such, even though this rule is not -official, we recommend that implementers enforce it nevertheless in -order to ensure forwards compatibility with the possible future addition -of this rule to RVWMO.

-
-
-
-
-
-
-

Appendix B: Formal Memory Model Specifications, Version 0.1

-
-
-

To facilitate formal analysis of RVWMO, this chapter presents a set of -formalizations using different tools and modeling approaches. Any -discrepancies are unintended; the expectation is that the models -describe exactly the same sets of legal behaviors.

-
-
-

This appendix should be treated as commentary; all normative material is -provided in Chapter 17 and in the rest of -the main body of the ISA specification. All currently known -discrepancies are listed in -Section A.7. Any other -discrepancies are unintentional.

-
-
-

B.1. Formal Axiomatic Specification in Alloy

-
-

We present a formal specification of the RVWMO memory model in Alloy -(alloy.mit.edu). This model is available online at -github.com/daniellustig/riscv-memory-model.

-
-
-

The online material also contains some litmus tests and some examples of -how Alloy can be used to model check some of the mappings in Section A.5.

-
-
-
Listing 10. The RVWMO memory model formalized in Alloy (1/5: PPO)
-
-
// =RVWMO PPO=
-
-// Preserved Program Order
-fun ppo : Event->Event {
-  // same-address ordering
-  po_loc :> Store
-  + rdw
-  + (AMO + StoreConditional) <: rfi
-
-  // explicit synchronization
-  + ppo_fence
-  + Acquire <: ^po :> MemoryEvent
-  + MemoryEvent <: ^po :> Release
-  + RCsc <: ^po :> RCsc
-  + pair
-
-  // syntactic dependencies
-  + addrdep
-  + datadep
-  + ctrldep :> Store
-
-  // pipeline dependencies
-  + (addrdep+datadep).rfi
-  + addrdep.^po :> Store
-}
-
-// the global memory order respects preserved program order
-fact { ppo in ^gmo }
-
-
-
-
The RVWMO memory model formalized in Alloy (2/5: Axioms)
-
-
// =RVWMO axioms=
-
-// Load Value Axiom
-fun candidates[r: MemoryEvent] : set MemoryEvent {
-  (r.~^gmo & Store & same_addr[r]) // writes preceding r in gmo
-  + (r.^~po & Store & same_addr[r]) // writes preceding r in po
-}
-
-fun latest_among[s: set Event] : Event { s - s.~^gmo }
-
-pred LoadValue {
-  all w: Store | all r: Load |
-    w->r in rf <=> w = latest_among[candidates[r]]
-}
-
-// Atomicity Axiom
-pred Atomicity {
-  all r: Store.~pair |            // starting from the lr,
-    no x: Store & same_addr[r] |  // there is no store x to the same addr
-      x not in same_hart[r]       // such that x is from a different hart,
-      and x in r.~rf.^gmo         // x follows (the store r reads from) in gmo,
-      and r.pair in x.^gmo        // and r follows x in gmo
-}
-
-// Progress Axiom implicit: Alloy only considers finite executions
-
-pred RISCV_mm { LoadValue and Atomicity /* and Progress */ }
-
-
-
-
Listing 11. The RVWMO memory model formalized in Alloy (3/5: model of memory)
-
-
//Basic model of memory
-
-sig Hart {  // hardware thread
-  start : one Event
-}
-sig Address {}
-abstract sig Event {
-  po: lone Event // program order
-}
-
-abstract sig MemoryEvent extends Event {
-  address: one Address,
-  acquireRCpc: lone MemoryEvent,
-  acquireRCsc: lone MemoryEvent,
-  releaseRCpc: lone MemoryEvent,
-  releaseRCsc: lone MemoryEvent,
-  addrdep: set MemoryEvent,
-  ctrldep: set Event,
-  datadep: set MemoryEvent,
-  gmo: set MemoryEvent,  // global memory order
-  rf: set MemoryEvent
-}
-sig LoadNormal extends MemoryEvent {} // l{b|h|w|d}
-sig LoadReserve extends MemoryEvent { // lr
-  pair: lone StoreConditional
-}
-sig StoreNormal extends MemoryEvent {}       // s{b|h|w|d}
-// all StoreConditionals in the model are assumed to be successful
-sig StoreConditional extends MemoryEvent {}  // sc
-sig AMO extends MemoryEvent {}               // amo
-sig NOP extends Event {}
-
-fun Load : Event { LoadNormal + LoadReserve + AMO }
-fun Store : Event { StoreNormal + StoreConditional + AMO }
-
-sig Fence extends Event {
-  pr: lone Fence, // opcode bit
-  pw: lone Fence, // opcode bit
-  sr: lone Fence, // opcode bit
-  sw: lone Fence  // opcode bit
-}
-sig FenceTSO extends Fence {}
-
-/* Alloy encoding detail: opcode bits are either set (encoded, e.g.,
- * as f.pr in iden) or unset (f.pr not in iden).  The bits cannot be used for
- * anything else */
-fact { pr + pw + sr + sw in iden }
-// likewise for ordering annotations
-fact { acquireRCpc + acquireRCsc + releaseRCpc + releaseRCsc in iden }
-// don't try to encode FenceTSO via pr/pw/sr/sw; just use it as-is
-fact { no FenceTSO.(pr + pw + sr + sw) }
-
-
-
-
Listing 12. The RVWMO memory model formalized in Alloy (4/5: Basic model rules)
-
-
// =Basic model rules=
-
-// Ordering annotation groups
-fun Acquire : MemoryEvent { MemoryEvent.acquireRCpc + MemoryEvent.acquireRCsc }
-fun Release : MemoryEvent { MemoryEvent.releaseRCpc + MemoryEvent.releaseRCsc }
-fun RCpc : MemoryEvent { MemoryEvent.acquireRCpc + MemoryEvent.releaseRCpc }
-fun RCsc : MemoryEvent { MemoryEvent.acquireRCsc + MemoryEvent.releaseRCsc }
-
-// There is no such thing as store-acquire or load-release, unless it's both
-fact { Load & Release in Acquire }
-fact { Store & Acquire in Release }
-
-// FENCE PPO
-fun FencePRSR : Fence { Fence.(pr & sr) }
-fun FencePRSW : Fence { Fence.(pr & sw) }
-fun FencePWSR : Fence { Fence.(pw & sr) }
-fun FencePWSW : Fence { Fence.(pw & sw) }
-
-fun ppo_fence : MemoryEvent->MemoryEvent {
-    (Load  <: ^po :> FencePRSR).(^po :> Load)
-  + (Load  <: ^po :> FencePRSW).(^po :> Store)
-  + (Store <: ^po :> FencePWSR).(^po :> Load)
-  + (Store <: ^po :> FencePWSW).(^po :> Store)
-  + (Load  <: ^po :> FenceTSO) .(^po :> MemoryEvent)
-  + (Store <: ^po :> FenceTSO) .(^po :> Store)
-}
-
-// auxiliary definitions
-fun po_loc : Event->Event { ^po & address.~address }
-fun same_hart[e: Event] : set Event { e + e.^~po + e.^po }
-fun same_addr[e: Event] : set Event { e.address.~address }
-
-// initial stores
-fun NonInit : set Event { Hart.start.*po }
-fun Init : set Event { Event - NonInit }
-fact { Init in StoreNormal }
-fact { Init->(MemoryEvent & NonInit) in ^gmo }
-fact { all e: NonInit | one e.*~po.~start }  // each event is in exactly one hart
-fact { all a: Address | one Init & a.~address } // one init store per address
-fact { no Init <: po and no po :> Init }
-
-
-
-
Listing 13. The RVWMO memory model formalized in Alloy (5/5: Auxiliaries)
-
-
// po
-fact { acyclic[po] }
-
-// gmo
-fact { total[^gmo, MemoryEvent] } // gmo is a total order over all MemoryEvents
-
-//rf
-fact { rf.~rf in iden } // each read returns the value of only one write
-fact { rf in Store <: address.~address :> Load }
-fun rfi : MemoryEvent->MemoryEvent { rf & (*po + *~po) }
-
-//dep
-fact { no StoreNormal <: (addrdep + ctrldep + datadep) }
-fact { addrdep + ctrldep + datadep + pair in ^po }
-fact { datadep in datadep :> Store }
-fact { ctrldep.*po in ctrldep }
-fact { no pair & (^po :> (LoadReserve + StoreConditional)).^po }
-fact { StoreConditional in LoadReserve.pair } // assume all SCs succeed
-
-// rdw
-fun rdw : Event->Event {
-  (Load <: po_loc :> Load)  // start with all same_address load-load pairs,
-  - (~rf.rf)                // subtract pairs that read from the same store,
-  - (po_loc.rfi)            // and subtract out "fri-rfi" patterns
-}
-
-// filter out redundant instances and/or visualizations
-fact { no gmo & gmo.gmo } // keep the visualization uncluttered
-fact { all a: Address | some a.~address }
-
-// =Optional: opcode encoding restrictions=
-
-// the list of blessed fences
-fact { Fence in
-  Fence.pr.sr
-  + Fence.pw.sw
-  + Fence.pr.pw.sw
-  + Fence.pr.sr.sw
-  + FenceTSO
-  + Fence.pr.pw.sr.sw
-}
-
-pred restrict_to_current_encodings {
-  no (LoadNormal + StoreNormal) & (Acquire + Release)
-}
-
-// =Alloy shortcuts=
-pred acyclic[rel: Event->Event] { no iden & ^rel }
-pred total[rel: Event->Event, bag: Event] {
-  all disj e, e': bag | e->e' in rel + ~rel
-  acyclic[rel]
-}
-
-
-
-
-

B.2. Formal Axiomatic Specification in Herd

-
-

The tool herd takes a memory model and a litmus test as input and simulates the execution of the test on top of the memory model. Memory models are written in the domain-specific language Cat. This section provides two Cat memory models of RVWMO. The first model, Listing 15, follows the global memory order (Chapter 18) definition of RVWMO, as much as is possible for a Cat model. The second model, Listing 16, is an equivalent, more efficient, partial order based RVWMO model.

-
-
-

The simulator herd is part of the diy tool -suite — see diy.inria.fr for software and documentation. The -models and more are available online at diy.inria.fr/cats7/riscv/.

-
-
-
Listing 14. riscv-defs.cat, a herd definition of preserved program order (1/3)
-
-
(*************)
-(* Utilities *)
-(*************)
-
-(* All fence relations *)
-let fence.r.r = [R];fencerel(Fence.r.r);[R]
-let fence.r.w = [R];fencerel(Fence.r.w);[W]
-let fence.r.rw = [R];fencerel(Fence.r.rw);[M]
-let fence.w.r = [W];fencerel(Fence.w.r);[R]
-let fence.w.w = [W];fencerel(Fence.w.w);[W]
-let fence.w.rw = [W];fencerel(Fence.w.rw);[M]
-let fence.rw.r = [M];fencerel(Fence.rw.r);[R]
-let fence.rw.w = [M];fencerel(Fence.rw.w);[W]
-let fence.rw.rw = [M];fencerel(Fence.rw.rw);[M]
-let fence.tso =
-  let f = fencerel(Fence.tso) in
-  ([W];f;[W]) | ([R];f;[M])
-
-let fence =
-  fence.r.r | fence.r.w | fence.r.rw |
-  fence.w.r | fence.w.w | fence.w.rw |
-  fence.rw.r | fence.rw.w | fence.rw.rw |
-  fence.tso
-
-(* Same address, no W to the same address in-between *)
-let po-loc-no-w = po-loc \ (po-loc?;[W];po-loc)
-(* Read same write *)
-let rsw = rf^-1;rf
-(* Acquire, or stronger  *)
-let AQ = Acq|AcqRel
-(* Release or stronger *)
-and RL = Rel|AcqRel
-(* All RCsc *)
-let RCsc = Acq|Rel|AcqRel
-(* Amo events are both R and W, relation rmw relates paired lr/sc *)
-let AMO = R & W
-let StCond = range(rmw)
-
-(*************)
-(* ppo rules *)
-(*************)
-
-(* Overlapping-Address Orderings *)
-let r1 = [M];po-loc;[W]
-and r2 = ([R];po-loc-no-w;[R]) \ rsw
-and r3 = [AMO|StCond];rfi;[R]
-(* Explicit Synchronization *)
-and r4 = fence
-and r5 = [AQ];po;[M]
-and r6 = [M];po;[RL]
-and r7 = [RCsc];po;[RCsc]
-and r8 = rmw
-(* Syntactic Dependencies *)
-and r9 = [M];addr;[M]
-and r10 = [M];data;[W]
-and r11 = [M];ctrl;[W]
-(* Pipeline Dependencies *)
-and r12 = [R];(addr|data);[W];rfi;[R]
-and r13 = [R];addr;[M];po;[W]
-
-let ppo = r1 | r2 | r3 | r4 | r5 | r6 | r7 | r8 | r9 | r10 | r11 | r12 | r13
-
-
-
-
Listing 15. riscv.cat, a herd version of the RVWMO memory model (2/3)
-
-
Total
-
-(* Notice that herd has defined its own rf relation *)
-
-(* Define ppo *)
-include "riscv-defs.cat"
-
-(********************************)
-(* Generate global memory order *)
-(********************************)
-
-let gmo0 = (* precursor: ie build gmo as an total order that include gmo0 *)
-  loc & (W\FW) * FW | # Final write after any write to the same location
-  ppo |               # ppo compatible
-  rfe                 # includes herd external rf (optimization)
-
-(* Walk over all linear extensions of gmo0 *)
-with  gmo from linearizations(M\IW,gmo0)
-
-(* Add initial writes upfront -- convenient for computing rfGMO *)
-let gmo = gmo | loc & IW * (M\IW)
-
-(**********)
-(* Axioms *)
-(**********)
-
-(* Compute rf according to the load value axiom, aka rfGMO *)
-let WR = loc & ([W];(gmo|po);[R])
-let rfGMO = WR \ (loc&([W];gmo);WR)
-
-(* Check equality of herd rf and of rfGMO *)
-empty (rf\rfGMO)|(rfGMO\rf) as RfCons
-
-(* Atomicity axiom *)
-let infloc = (gmo & loc)^-1
-let inflocext = infloc & ext
-let winside  = (infloc;rmw;inflocext) & (infloc;rf;rmw;inflocext) & [W]
-empty winside as Atomic
-
-
-
-
Listing 16. riscv.cat, an alternative herd presentation of the RVWMO memory model (3/3)
-
-
Partial
-
-(***************)
-(* Definitions *)
-(***************)
-
-(* Define ppo *)
-include "riscv-defs.cat"
-
-(* Compute coherence relation *)
-include "cos-opt.cat"
-
-(**********)
-(* Axioms *)
-(**********)
-
-(* Sc per location *)
-acyclic co|rf|fr|po-loc as Coherence
-
-(* Main model axiom *)
-acyclic co|rfe|fr|ppo as Model
-
-(* Atomicity axiom *)
-empty rmw & (fre;coe) as Atomic
-
-
-
-
-

B.3. An Operational Memory Model

-
-

This is an alternative presentation of the RVWMO memory model in -operational style. It aims to admit exactly the same extensional -behavior as the axiomatic presentation: for any given program, admitting -an execution if and only if the axiomatic presentation allows it.

-
-
-

The axiomatic presentation is defined as a predicate on complete -candidate executions. In contrast, this operational presentation has an -abstract microarchitectural flavor: it is expressed as a state machine, -with states that are an abstract representation of hardware machine -states, and with explicit out-of-order and speculative execution (but -abstracting from more implementation-specific microarchitectural details -such as register renaming, store buffers, cache hierarchies, cache -protocols, etc.). As such, it can provide useful intuition. It can also -construct executions incrementally, making it possible to interactively -and randomly explore the behavior of larger examples, while the -axiomatic model requires complete candidate executions over which the -axioms can be checked.

-
-
-

The operational presentation covers mixed-size execution, with -potentially overlapping memory accesses of different power-of-two byte -sizes. Misaligned accesses are broken up into single-byte accesses.

-
-
-

The operational model, together with a fragment of the RISC-V ISA -semantics (RV64I and A), are integrated into the rmem exploration tool -(github.com/rems-project/rmem). rmem can explore litmus tests -(see Section A.2) and small ELF binaries -exhaustively, pseudorandomly and interactively. In rmem, the ISA -semantics is expressed explicitly in Sail (see -github.com/rems-project/sail for the Sail language, and -github.com/rems-project/sail-riscv for the RISC-V ISA model), -and the concurrency semantics is expressed in Lem (see -github.com/rems-project/lem for the Lem language).

-
-
-

rmem has a command-line interface and a web-interface. The -web-interface runs entirely on the client side, and is provided online -together with a library of litmus tests: -www.cl.cam.ac.uk/~pes20/rmem. The command-line interface is -faster than the web-interface, especially in exhaustive mode.

-
-
-

Below is an informal introduction of the model states and transitions. -The description of the formal model starts in the next subsection.

-
-
-

Terminology: In contrast to the axiomatic presentation, here every -memory operation is either a load or a store. Hence, AMOs give rise to -two distinct memory operations, a load and a store. When used in -conjunction with instruction, the terms load and store refer -to instructions that give rise to such memory operations. As such, both -include AMO instructions. The term acquire refers to an instruction -(or its memory operation) with the acquire-RCpc or acquire-RCsc -annotation. The term release refers to an instruction (or its memory -operation) with the release-RCpc or release-RCsc annotation.

-
-
-

Model states

-
-
-

Model states: A model state consists of a shared memory and a tuple of hart states.

-
-
-
-Diagram -
-
-
-

The shared memory state records all the memory store operations that -have propagated so far, in the order they propagated (this can be made -more efficient, but for simplicity of the presentation we keep it this -way).

-
-
-

Each hart state consists principally of a tree of instruction instances, -some of which have been finished, and some of which have not. -Non-finished instruction instances can be subject to restart, e.g. if -they depend on an out-of-order or speculative load that turns out to be -unsound.

-
-
-

Conditional branch and indirect jump instructions may have multiple -successors in the instruction tree. When such an instruction is finished, -any un-taken alternative paths are discarded.

-
-
-

Each instruction instance in the instruction tree has a state that -includes an execution state of the intra-instruction semantics (the ISA -pseudocode for this instruction). The model uses a formalization of the -intra-instruction semantics in Sail. One can think of the execution -state of an instruction as a representation of the pseudocode control -state, pseudocode call stack, and local variable values. An instruction -instance state also includes information about the instance’s memory and -register footprints, its register reads and writes, its memory -operations, whether it is finished, etc.

-
-
-

Model transitions

-
-
-

The model defines, for any model state, the set of allowed transitions, -each of which is a single atomic step to a new abstract machine state. -Execution of a single instruction will typically involve many -transitions, and they may be interleaved in operational-model execution -with transitions arising from other instructions. Each transition arises -from a single instruction instance; it will change the state of that -instance, and it may depend on or change the rest of its hart state and -the shared memory state, but it does not depend on other hart states, -and it will not change them. The transitions are introduced below and -defined in Section B.3.5, with a precondition and -a construction of the post-transition model state for each.

-
-
-

Transitions for all instructions:

-
-
-
    -
  • -

    Fetch instruction: This transition represents a fetch and decode of a new instruction instance, as a program order successor of a previously fetched -instruction instance (or the initial fetch address).

    -
  • -
-
-
-

The model assumes the instruction memory is fixed; it does not describe -the behavior of self-modifying code. In particular, the Fetch instruction transition does -not generate memory load operations, and the shared memory is not -involved in the transition. Instead, the model depends on an external -oracle that provides an opcode when given a memory location.

-
-
-
    -
  • -

    Register write: This is a write of a register value.

    -
  • -
  • -

    Register read: This is a read of a register value from the most recent -program-order-predecessor instruction instance that writes to that -register.

    -
  • -
  • -

    Pseudocode internal step: This covers pseudocode internal computation: arithmetic, function -calls, etc.

    -
  • -
  • -

    Finish instruction: At this point the instruction pseudocode is done, the instruction cannot be restarted, memory accesses cannot be discarded, and all memory -effects have taken place. For conditional branch and indirect jump -instructions, any program order successors that were fetched from an -address that is not the one that was written to the pc register are -discarded, together with the sub-tree of instruction instances below -them.

    -
  • -
-
-
-

Transitions specific to load instructions:

-
-
-
    -
  • -

    Initiate memory load operations: At this point the memory footprint of the load instruction is -provisionally known (it could change if earlier instructions are -restarted) and its individual memory load operations can start being -satisfied.

    -
  • -
-
-
- -
-
-
    -
  • -

    Complete load operations: At this point all the memory load operations of the instruction have -been entirely satisfied and the instruction pseudocode can continue -executing. A load instruction can be subject to being restarted until -the Finish instruction transition. But, under some conditions, the model might treat a load -instruction as non-restartable even before it is finished (e.g. see Propagate store operation).

    -
  • -
-
-
-

Transitions specific to store instructions:

-
-
- -
-
- -
-
-
    -
  • -

    Complete store operations: At this point all the memory store operations of the instruction -have been propagated to memory, and the instruction pseudocode can -continue executing.

    -
  • -
-
-
-

Transitions specific to sc instructions:

-
-
- -
-
-

Transitions specific to AMO instructions:

-
-
- -
-
-

Transitions specific to fence instructions:

-
-
- -
-
-

The transitions labeled $\circ$ can always be taken eagerly, -as soon as their precondition is satisfied, without excluding other -behavior; the $\bullet$ cannot. Although Fetch instruction is marked with a -$\bullet$, it can be taken eagerly as long as it is not -taken infinitely many times.

-
-
-

An instance of a non-AMO load instruction, after being fetched, will -typically experience the following transitions in this order:

-
- -
-

Before, between and after the transitions above, any number of -Pseudocode internal step transitions may appear. In addition, a Fetch instruction transition for fetching the -instruction in the next program location will be available until it is -taken.

-
-
-

This concludes the informal description of the operational model. The -following sections describe the formal operational model.

-
-
-

B.3.1. Intra-instruction Pseudocode Execution

-
-

The intra-instruction semantics for each instruction instance is -expressed as a state machine, essentially running the instruction -pseudocode. Given a pseudocode execution state, it computes the next -state. Most states identify a pending memory or register operation, -requested by the pseudocode, which the memory model has to do. The -states are (this is a tagged union; tags in small-caps):

-
- ---- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -

Load_mem(kind, address, size, load_continuation)

- memory load -operation

Early_sc_fail(res_continuation)

- allow sc to fail early

Store_ea(kind, address, size, next_state)

- memory store -effective address

Store_memv(mem_value, store_continuation)

- memory store value

Fence(kind, next_state)

- fence

Read_reg(reg_name, read_continuation)

- register read

Write_reg(reg_name, reg_value, next_state)

- register write

Internal(next_state)

- pseudocode internal step

Done

- end of pseudocode

-
-

Here:

-
-
-
    -
  • -

    mem_value and reg_value are lists of bytes;

    -
  • -
  • -

    address is an integer of XLEN bits;

    -
  • -
-
-
-

for load/store, kind identifies whether it is lr/sc, -acquire-RCpc/release-RCpc, acquire-RCsc/release-RCsc, -acquire-release-RCsc;

  • for fence, kind identifies whether it is a normal or TSO fence, and (for -normal fences) the predecessor and successor ordering bits;

  • reg_name identifies a register and a slice thereof (start and end bit -indices); and the continuations describe how the instruction instance will continue -for each value that might be provided by the surrounding memory model -(the load_continuation and read_continuation take the value loaded -from memory and read from the previous register write, the -store_continuation takes false for an sc that failed and true in -all other cases, and res_continuation takes false if the sc fails -and true otherwise).

-
-
- - - - - -
- - -
-

For example, given the load instruction lw x1,0(x2), an execution will -typically go as follows. The initial execution state will be computed -from the pseudocode for the given opcode. This can be expected to be -Read_reg(x2, read_continuation). Feeding the most recently written -value of register x2 (the instruction semantics will be blocked if -necessary until the register value is available), say 0x4000, to -read_continuation returns Load_mem(plain_load, 0x4000, 4, -load_continuation). Feeding the 4-byte value loaded from memory -location 0x4000, say 0x42, to load_continuation returns -Write_reg(x1, 0x42, Done). Many Internal(next_state) states may -appear before and between the states above.

-
-
-
-
-

Notice that writing to memory is split into two steps, Store_ea and -Store_memv: the first one makes the memory footprint of the store -provisionally known, and the second one adds the value to be stored. We -ensure these are paired in the pseudocode (Store_ea followed by -Store_memv), but there may be other steps between them.

-
-
- - - - - -
- - -
-

It is observable that the Store_ea can occur before the value to be -stored is determined. For example, for the litmus test -LB+fence.r.rw+data-po to be allowed by the operational model (as it is -by RVWMO), the first store in Hart 1 has to take the Store_ea step -before its value is determined, so that the second store can see it is -to a non-overlapping memory footprint, allowing the second store to be -committed out of order without violating coherence.

-
-
-
-
-

The pseudocode of each instruction performs at most one store or one -load, except for AMOs that perform exactly one load and one store. Those -memory accesses are then split apart into the architecturally atomic -units by the hart semantics (see Initiate memory load operations and Initiate memory store operation footprints below).

-
-
-

Informally, each bit of a register read should be satisfied from a -register write by the most recent (in program order) instruction -instance that can write that bit (or from the hart’s initial register -state if there is no such write). Hence, it is essential to know the -register write footprint of each instruction instance, which we -calculate when the instruction instance is created (see the action of Fetch instruction -below). We ensure in the pseudocode that each instruction does at most -one register write to each register bit, and also that it does not try -to read a register value it just wrote.

-
-
-

Data-flow dependencies (address and data) in the model emerge from the -fact that each register read has to wait for the appropriate register -write to be executed (as described above).

-
-
-
-

B.3.2. Instruction Instance State

-
-

Each instruction instance $i$ has a state comprising:

-
-
-
    -
  • -

    program_loc, the memory address from which the instruction was -fetched;

    -
  • -
  • -

    instruction_kind, identifying whether this is a load, store, AMO, -fence, branch/jump or a simple instruction (this also includes a -kind similar to the one described for the pseudocode execution -states);

    -
  • -
  • -

    src_regs, the set of source reg_names (including system -registers), as statically determined from the pseudocode of the -instruction;

    -
  • -
  • -

    dst_regs, the destination reg_names (including system registers), -as statically determined from the pseudocode of the instruction;

    -
  • -
  • -

    pseudocode_state (or sometimes just state for short), one of (this -is a tagged union; tags in small-caps):

    -
  • -
-
- ---- - - - - - - - - - - - - - - - - -
Plain(isa_state)- ready to make a pseudocode transition

Pending_mem_loads(load_continuation)

- requesting memory load -operation(s)

Pending_mem_stores(store_continuation)

- requesting memory store -operation(s)

-
-
    -
  • -

    reg_reads, the register reads the instance has performed, including, -for each one, the register write slices it read from;

    -
  • -
  • -

    reg_writes, the register writes the instance has performed;

    -
  • -
  • -

    mem_loads, a set of memory load operations, and for each one the -as-yet-unsatisfied slices (the byte indices that have not been satisfied -yet), and, for the satisfied slices, the store slices (each consisting -of a memory store operation and subset of its byte indices) that -satisfied it.

    -
  • -
  • -

    mem_stores, a set of memory store operations, and for each one a -flag that indicates whether it has been propagated (passed to the shared -memory) or not.

    -
  • -
  • -

    information recording whether the instance is committed, finished, -etc.

    -
  • -
-
-
-

Each memory load operation includes a memory footprint (address and -size). Each memory store operation includes a memory footprint, and, -when available, a value.

-
-
-

A load instruction instance with a non-empty mem_loads, for which all -the load operations are satisfied (i.e. there are no unsatisfied load -slices) is said to be entirely satisfied.

-
-
-

Informally, an instruction instance is said to have fully determined -data if the load (and sc) instructions feeding its source registers -are finished. Similarly, it is said to have a fully determined memory -footprint if the load (and sc) instructions feeding its memory -operation address register are finished. Formally, we first define the -notion of fully determined register write: a register write -$w$ from reg_writes of instruction instance -$i$ is said to be fully determined if one of the following -conditions holds:

-
-
-
    -
  1. -

    $i$ is finished; or

    -
  2. -
  3. -

    the value written by $w$ is not affected by a memory -operation that $i$ has made (i.e. a value loaded from memory -or the result of sc), and, for every register read that -$i$ has made, that affects $w$, the register -write from which $i$ read is fully determined (or -$i$ read from the initial register state).

    -
  4. -
-
-
-

Now, an instruction instance $i$ is said to have fully -determined data if for every register read $r$ from -reg_reads, the register writes that $r$ reads from are -fully determined. An instruction instance $i$ is said to -have a fully determined memory footprint if for every register read -$r$ from reg_reads that feeds into $i$’s -memory operation address, the register writes that $r$ reads -from are fully determined.

-
-
- - - - - -
- - -
-

The rmem tool records, for every register write, the set of register -writes from other instructions that have been read by this instruction -at the point of performing the write. By carefully arranging the -pseudocode of the instructions covered by the tool we were able to make -it so that this is exactly the set of register writes on which the write -depends.

-
-
-
-
-
-

B.3.3. Hart State

-
-

The model state of a single hart comprises:

-
-
-
    -
  • -

    hart_id, a unique identifier of the hart;

    -
  • -
  • -

    initial_register_state, the initial register value for each -register;

    -
  • -
  • -

    initial_fetch_address, the initial instruction fetch address;

    -
  • -
  • -

    instruction_tree, a tree of the instruction instances that have been -fetched (and not discarded), in program order.

    -
  • -
-
-
-
-

B.3.4. Shared Memory State

-
-

The model state of the shared memory comprises a list of memory store -operations, in the order they propagated to the shared memory.

-
-
-

When a store operation is propagated to the shared memory it is simply -added to the end of the list. When a load operation is satisfied from -memory, for each byte of the load operation, the most recent -corresponding store slice is returned.

-
-
- - - - - -
- - -
-

For most purposes, it is simpler to think of the shared memory as an -array, i.e., a map from memory locations to memory store operation -slices, where each memory location is mapped to a one-byte slice of the -most recent memory store operation to that location. However, this -abstraction is not detailed enough to properly handle the sc -instruction. The RVWMO allows store operations from the same hart as the -sc to intervene between the store operation of the sc and the store -operations the paired lr read from. To allow such store operations to -intervene, and forbid others, the array abstraction must be extended to -record more information. Here, we use a list as it is very simple, but a -more efficient and scalable implementation should probably use -something better.

-
-
-
-
-
-

B.3.5. Transitions

-
-

Each of the paragraphs below describes a single kind of system -transition. The description starts with a condition over the current -system state. The transition can be taken in the current state only if -the condition is satisfied. The condition is followed by an action that -is applied to that state when the transition is taken, in order to -generate the new system state.

-
-
-
Fetch instruction
-
-

A possible program-order-successor of instruction instance -$i$ can be fetched from address loc if:

-
-
-
    -
  1. -

    it has not already been fetched, i.e., none of the immediate -successors of $i$ in the hart’s instruction_tree are from -loc; and

    -
  2. -
  3. -

    if $i$’s pseudocode has already written an address to -pc, then loc must be that address, otherwise loc is:

    -
    -
      -
    • -

      for a conditional branch, the successor address or the branch target -address;

      -
    • -
    • -

      for a (direct) jump and link instruction (jal), the target address;

      -
    • -
    • -

      for an indirect jump instruction (jalr), any address; and

      -
    • -
    • -

      for any other instruction, $i.\mathit{program\_loc} + 4$.

      -
    • -
    -
    -
  4. -
-
-
-

Action: construct a freshly initialized instruction instance -$i'$ for the instruction in the program memory at loc, -with state Plain(isa_state), computed from the instruction pseudocode, -including the static information available from the pseudocode such as -its instruction_kind, src_regs, and dst_regs, and add -$i'$ to the hart’s instruction_tree as a successor of -$i$.

-
-
-

The possible next fetch addresses (loc) are available immediately -after fetching $i$ and the model does not need to wait for -the pseudocode to write to pc; this allows out-of-order execution, and -speculation past conditional branches and jumps. For most instructions -these addresses are easily obtained from the instruction pseudocode. The -only exception to that is the indirect jump instruction (jalr), where -the address depends on the value held in a register. In principle the -mathematical model should allow speculation to arbitrary addresses here. -The exhaustive search in the rmem tool handles this by running the -exhaustive search multiple times with a growing set of possible next -fetch addresses for each indirect jump. The initial search uses empty -sets, hence there is no fetch after an indirect jump instruction until the -pseudocode of the instruction writes to pc, and then we use that value -for fetching the next instruction. Before starting the next iteration of -exhaustive search, we collect for each indirect jump (grouped by code -location) the set of values it wrote to pc in all the executions in -the previous search iteration, and use that as possible next fetch -addresses of the instruction. This process terminates when no new fetch -addresses are detected.

-
-
-
-
Initiate memory load operations
-
-

An instruction instance $i$ in state Plain(Load_mem(kind, -address, size, load_continuation)) can always initiate the -corresponding memory load operations. Action:

-
-
-
    -
  1. -

    Construct the appropriate memory load operations $\mathit{mlos}$:

    -
    -
      -
    • -

      if address is aligned to size then $\mathit{mlos}$ is a single -memory load operation of size bytes from address;

      -
    • -
    • -

      otherwise, $\mathit{mlos}$ is a set of size memory load -operations, each of one byte, from the addresses -$\mathit{address} \ldots \mathit{address} + \mathit{size} - 1$.

      -
    • -
    -
    -
  2. -
  3. -

    set mem_loads of $i$ to $\mathit{mlos}$; and

    -
  4. -
  5. -

    update the state of $i$ to -Pending_mem_loads(load_continuation).

    -
  6. -
-
-
-
-
-

In Section 18.1.1 it is said that -misaligned memory accesses may be decomposed at any granularity. Here we -decompose them to one-byte accesses as this granularity subsumes all -others.

-
-
-
-
-
-
Satisfy memory load operation by forwarding from unpropagated stores
-
-

For a non-AMO load instruction instance $i$ in state -Pending_mem_loads(load_continuation), and a memory load operation -$\mathit{mlo}$ in $i.\mathit{mem\_loads}$ that has -unsatisfied slices, the memory load operation can be partially or -entirely satisfied by forwarding from unpropagated memory store -operations by store instruction instances that are program-order-before -$i$ if:

-
-
-
    -
  1. -

    all program-order-previous fence instructions with .sr and .pw -set are finished;

    -
  2. -
  3. -

    for every program-order-previous fence instruction, $f$, -with .sr and .pr set, and .pw not set, if $f$ is not -finished then all load instructions that are program-order-before -$f$ are entirely satisfied;

    -
  4. -
  5. -

    for every program-order-previous fence.tso instruction, -$f$, that is not finished, all load instructions that are -program-order-before $f$ are entirely satisfied;

    -
  6. -
  7. -

    if $i$ is a load-acquire-RCsc, all program-order-previous -store-releases-RCsc are finished;

    -
  8. -
  9. -

    if $i$ is a load-acquire-release, all -program-order-previous instructions are finished;

    -
  10. -
  11. -

    all non-finished program-order-previous load-acquire instructions are -entirely satisfied; and

    -
  12. -
  13. -

    all program-order-previous store-acquire-release instructions are -finished;

    -
  14. -
-
-
-

Let $\mathit{msoss}$ be the set of all unpropagated memory store -operation slices from non-sc store instruction instances that are -program-order-before $i$ and have already calculated the -value to be stored, that overlap with the unsatisfied slices of -$\mathit{mlo}$, and which are not superseded by intervening store -operations or store operations that are read from by an intervening -load. The last condition requires, for each memory store operation slice -$\mathit{msos}$ in $\mathit{msoss}$ from instruction -$i'$:

-
-
-
    -
  • -

    that there is no store instruction program-order-between $i$ -and $i'$ with a memory store operation overlapping -$\mathit{msos}$; and

    -
  • -
  • -

    that there is no load instruction program-order-between $i$ -and $i'$ that was satisfied from an overlapping memory store -operation slice from a different hart.

    -
  • -
-
-
-

Action:

-
-
-
    -
  1. -

    update $i.\mathit{mem\_loads}$ to indicate that -$\mathit{mlo}$ was satisfied by $\mathit{msoss}$; and

    -
  2. -
  3. -

    restart any speculative instructions which have violated coherence as -a result of this, i.e., for every non-finished instruction -$i'$ that is a program-order-successor of $i$, -and every memory load operation $\mathit{mlo}'$ of $i'$ -that was satisfied from $\mathit{msoss}'$, if there exists a memory -store operation slice $\mathit{msos}'$ in $\mathit{msoss}'$, and -an overlapping memory store operation slice from a different memory -store operation in $\mathit{msoss}$, and $\mathit{msos}'$ is not -from an instruction that is a program-order-successor of -$i$, restart $i'$ and its restart-dependents.

    -
  4. -
-
-
-

Where, the restart-dependents of instruction $j$ are:

-
-
-
    -
  • -

    program-order-successors of $j$ that have data-flow -dependency on a register write of $j$;

    -
  • -
  • -

    program-order-successors of $j$ that have a memory load -operation that reads from a memory store operation of $j$ -(by forwarding);

    -
  • -
  • -

    if $j$ is a load-acquire, all the program-order-successors -of $j$;

    -
  • -
  • -

    if $j$ is a load, for every fence, $f$, with -.sr and .pr set, and .pw not set, that is a -program-order-successor of $j$, all the load instructions -that are program-order-successors of $f$;

    -
  • -
  • -

    if $j$ is a load, for every fence.tso, $f$, -that is a program-order-successor of $j$, all the load -instructions that are program-order-successors of $f$; and

    -
  • -
  • -

    (recursively) all the restart-dependents of all the instruction -instances above.

    -
  • -
-
-
-
-
-

Forwarding memory store operations to a memory load might satisfy only -some slices of the load, leaving other slices unsatisfied.

-
-
-

A program-order-previous store operation that was not available when -taking the transition above might make $\mathit{msoss}$ provisionally -unsound (violating coherence) when it becomes available. That store will -prevent the load from being finished (see Finish instruction), and will cause it to -restart when that store operation is propagated (see Propagate store operation).

-
-
-

A consequence of the transition condition above is that -store-release-RCsc memory store operations cannot be forwarded to -load-acquire-RCsc instructions: $\mathit{msoss}$ does not include -memory store operations from finished stores (as those must be -propagated memory store operations), and the condition above requires -all program-order-previous store-releases-RCsc to be finished when the -load is acquire-RCsc.

-
-
-
-
-
-
Satisfy memory load operation from memory
-
-

For an instruction instance $i$ of a non-AMO load -instruction or an AMO instruction in the context of the Satisfy, commit and propagate operations of an AMO transition, -any memory load operation $\mathit{mlo}$ in -$i.\mathit{mem\_loads}$ that has unsatisfied slices, can be -satisfied from memory if all the conditions of Satisfy memory load operation by forwarding from unpropagated stores are satisfied. Action: -let $\mathit{msoss}$ be the memory store operation slices from memory -covering the unsatisfied slices of $\mathit{mlo}$, and apply the -action of Satisfy memory load operation by forwarding from unpropagated stores.

-
-
- - - - - -
- - -
-

Note that Satisfy memory load operation by forwarding from unpropagated stores might leave some slices of the memory load operation -unsatisfied; those will have to be satisfied by taking the transition -again, or taking Satisfy memory load operation from memory. Satisfy memory load operation from memory, on the other hand, will always satisfy all the -unsatisfied slices of the memory load operation.

-
-
-
-
-
-
Complete load operations
-
-

A load instruction instance $i$ in state -Pending_mem_loads(load_continuation) can be completed (not to be -confused with finished) if all the memory load operations -$i.\mathit{mem\_loads}$ are entirely satisfied (i.e. there -are no unsatisfied slices). Action: update the state of $i$ -to Plain(load_continuation(mem_value)), where mem_value is assembled -from all the memory store operation slices that satisfied -$i.\mathit{mem\_loads}$.

-
-
-
-
Early sc fail
-
-

An sc instruction instance stem 6ac91b4e7dd35551c6ea477deba5f82d in state -Plain(Early_sc_fail(res_continuation)) can always be made to fail. -Action: update the state of stem 6ac91b4e7dd35551c6ea477deba5f82d to -Plain(res_continuation(false)).

-
-
-
-
Paired sc
-
-

An sc instruction instance stem 6ac91b4e7dd35551c6ea477deba5f82d in state -Plain(Early_sc_fail(res_continuation)) can continue its (potentially -successful) execution if stem 6ac91b4e7dd35551c6ea477deba5f82d is paired with an lr. Action: -update the state of stem 6ac91b4e7dd35551c6ea477deba5f82d to Plain(res_continuation(true)).

-
-
-
-
Initiate memory store operation footprints
-
-

An instruction instance stem 6ac91b4e7dd35551c6ea477deba5f82d in state Plain(Store_ea(kind, -address, size, next_state)) can always announce its pending memory -store operation footprint. Action:

-
-
-
    -
  1. -

    construct the appropriate memory store operations stem f85095907f5d98a9545a2a897907c982 -(without the store value):

    -
    -
      -
    • -

      if address is aligned to size then stem f85095907f5d98a9545a2a897907c982 is a single -memory store operation of size bytes to address;

      -
    • -
    • -

      otherwise, stem f85095907f5d98a9545a2a897907c982 is a set of size memory store -operations, each of one-byte size, to the addresses -stem f82c08e4a3f84915b619aa6472e0f810.

      -
    • -
    -
    -
  2. -
  3. -

    set stem a9642b9ca23ba78275edb15d213c9b95 to stem f85095907f5d98a9545a2a897907c982; and

    -
  4. -
  5. -

    update the state of stem 6ac91b4e7dd35551c6ea477deba5f82d to Plain(next_state).

    -
  6. -
-
-
-
-
-

Note that after taking the transition above the memory store operations -do not yet have their values. The importance of splitting this -transition from the transition below is that it allows other -program-order-successor store instructions to observe the memory -footprint of this instruction, and if they don’t overlap, propagate out -of order as early as possible (i.e. before the data register value -becomes available).

-
-
-
-
-
-
Instantiate memory store operation values
-
-

An instruction instance stem 6ac91b4e7dd35551c6ea477deba5f82d in state -Plain(Store_memv(mem_value, store_continuation)) can always -instantiate the values of the memory store operations -stem a9642b9ca23ba78275edb15d213c9b95. Action:

-
-
-
    -
  1. -

    split mem_value between the memory store operations -stem a9642b9ca23ba78275edb15d213c9b95; and

    -
  2. -
  3. -

    update the state of stem 6ac91b4e7dd35551c6ea477deba5f82d to -Pending_mem_stores(store_continuation).

    -
  4. -
-
-
-
-
Commit store instruction
-
-

An uncommitted instruction instance stem 6ac91b4e7dd35551c6ea477deba5f82d of a non-sc store -instruction or an sc instruction in the context of the Commit and propagate store operation of an sc -transition, in state Pending_mem_stores(store_continuation), can be -committed (not to be confused with propagated) if:

-
-
-
    -
  1. -

    stem 6ac91b4e7dd35551c6ea477deba5f82d has fully determined data;

    -
  2. -
  3. -

    all program-order-previous conditional branch and indirect jump -instructions are finished;

    -
  4. -
  5. -

    all program-order-previous fence instructions with .sw set are -finished;

    -
  6. -
  7. -

    all program-order-previous fence.tso instructions are finished;

    -
  8. -
  9. -

    all program-order-previous load-acquire instructions are finished;

    -
  10. -
  11. -

    all program-order-previous store-acquire-release instructions are -finished;

    -
  12. -
  13. -

    if stem 6ac91b4e7dd35551c6ea477deba5f82d is a store-release, all program-order-previous -instructions are finished;

    -
  14. -
  15. -

    all program-order-previous memory access instructions have a fully -determined memory footprint;

    -
  16. -
  17. -

    all program-order-previous store instructions, except for sc that failed, -have initiated and so have non-empty mem_stores; and

    -
  18. -
  19. -

    all program-order-previous load instructions have initiated and so have -non-empty mem_loads.

    -
  20. -
-
-
-

Action: record that i is committed.

-
-
- - - - - -
- - -
-

Notice that if condition -8 is satisfied -the conditions -9 and -10 are also -satisfied, or will be satisfied after taking some eager transitions. -Hence, requiring them does not strengthen the model. By requiring them, -we guarantee that previous memory access instructions have taken enough -transitions to make their memory operations visible for the condition -check of Propagate store operation, which is the next transition the instruction will take, -making that condition simpler.

-
-
-
-
-
-
Propagate store operation
-
-

For a committed instruction instance stem 6ac91b4e7dd35551c6ea477deba5f82d in state -Pending_mem_stores(store_continuation), and an unpropagated memory -store operation stem b406cb52bbdcb9c7799e21d181e685c1 in -stem a9642b9ca23ba78275edb15d213c9b95, stem b406cb52bbdcb9c7799e21d181e685c1 can be -propagated if:

-
-
-
    -
  1. -

    all memory store operations of program-order-previous store -instructions that overlap with stem b406cb52bbdcb9c7799e21d181e685c1 have already -propagated;

    -
  2. -
  3. -

    all memory load operations of program-order-previous load instructions -that overlap with stem b406cb52bbdcb9c7799e21d181e685c1 have already been satisfied, and -(the load instructions) are non-restartable (see definition below); -and

    -
  4. -
  5. -

    all memory load operations that were satisfied by forwarding -stem b406cb52bbdcb9c7799e21d181e685c1 are entirely satisfied.

    -
  6. -
-
-
-

Where a non-finished instruction instance stem 7d6abfd7a8903d8827b35524704fd563 is -non-restartable if:

-
-
-
    -
  1. -

    there does not exist a store instruction stem 0d6d0a3f5bcc5a27d3ba5cf91524b5a4 and an -unpropagated memory store operation stem b406cb52bbdcb9c7799e21d181e685c1 of stem 0d6d0a3f5bcc5a27d3ba5cf91524b5a4 -such that applying the action of the Propagate store operation transition to -stem b406cb52bbdcb9c7799e21d181e685c1 will result in the restart of stem 7d6abfd7a8903d8827b35524704fd563; and

    -
  2. -
  3. -

    there does not exist a non-finished load instruction stem 65b1894f130a02fb32beeead9ca75d74 -and a memory load operation stem f1903748b6bb79515f22670686a6d90c of stem 65b1894f130a02fb32beeead9ca75d74 such -that applying the action of the Satisfy memory load operation by forwarding from unpropagated stores/Satisfy memory load operation from memory transition (even if -stem f1903748b6bb79515f22670686a6d90c is already satisfied) to stem f1903748b6bb79515f22670686a6d90c will result -in the restart of stem 7d6abfd7a8903d8827b35524704fd563.

    -
  4. -
-
-
-

Action:

-
-
-
    -
  1. -

    update the shared memory state with stem b406cb52bbdcb9c7799e21d181e685c1;

    -
  2. -
  3. -

    update stem a9642b9ca23ba78275edb15d213c9b95 to indicate that -stem b406cb52bbdcb9c7799e21d181e685c1 was propagated; and

    -
  4. -
  5. -

    restart any speculative instructions which have violated coherence as -a result of this, i.e., for every non-finished instruction -stem fd12704256eca98f1c71f4fa1dde9dd8 program-order-after stem 6ac91b4e7dd35551c6ea477deba5f82d and every memory -load operation stem 8571a8a5d097d63350efc6d8da824055 of stem fd12704256eca98f1c71f4fa1dde9dd8 that was satisfied -from stem 1344dde66edf219ff9e7873942a34a33, if there exists a memory store operation -slice stem 80b56f226b89976549d536e6d2967ed7 in stem 1344dde66edf219ff9e7873942a34a33 that overlaps with -stem b406cb52bbdcb9c7799e21d181e685c1 and is not from stem b406cb52bbdcb9c7799e21d181e685c1, and -stem 80b56f226b89976549d536e6d2967ed7 is not from a program-order-successor of -stem 6ac91b4e7dd35551c6ea477deba5f82d, restart stem fd12704256eca98f1c71f4fa1dde9dd8 and its restart-dependents -(see Satisfy memory load operation by forwarding from unpropagated stores).

    -
  6. -
-
-
-
-
Commit and propagate store operation of an sc
-
-

An uncommitted sc instruction instance stem 6ac91b4e7dd35551c6ea477deba5f82d, from hart -stem af98b3d273f2fa50eec5140dd48d1eae, in state Pending_mem_stores(store_continuation), with -a paired lr stem fd12704256eca98f1c71f4fa1dde9dd8 that has been satisfied by some store -slices stem 082668fc86f03352833a0820f08bf3cc, can be committed and propagated at the same -time if:

-
-
-
    -
  1. -

    stem fd12704256eca98f1c71f4fa1dde9dd8 is finished;

    -
  2. -
  3. -

    every memory store operation that has been forwarded to -stem fd12704256eca98f1c71f4fa1dde9dd8 is propagated;

    -
  4. -
  5. -

    the conditions of Commit store instruction are satisfied;

    -
  6. -
  7. -

    the conditions of Propagate store operation are satisfied (notice that an sc instruction can -only have one memory store operation); and

    -
  8. -
  9. -

    for every store slice stem f85095907f5d98a9545a2a897907c982 from stem 082668fc86f03352833a0820f08bf3cc, -stem f85095907f5d98a9545a2a897907c982 has not been overwritten, in the shared memory, by a -store that is from a hart that is not stem af98b3d273f2fa50eec5140dd48d1eae, at any point -since stem f85095907f5d98a9545a2a897907c982 was propagated to memory.

    -
  10. -
-
-
-

Action:

-
-
-
    -
  1. -

    apply the actions of Commit store instruction; and

    -
  2. -
  3. -

    apply the action of Propagate store operation.

    -
  4. -
-
-
-
-
Late sc fail
-
-

An sc instruction instance stem 6ac91b4e7dd35551c6ea477deba5f82d in state -Pending_mem_stores(store_continuation), that has not propagated its -memory store operation, can always be made to fail. Action:

-
-
-
    -
  1. -

    clear stem a9642b9ca23ba78275edb15d213c9b95; and

    -
  2. -
  3. -

    update the state of stem 6ac91b4e7dd35551c6ea477deba5f82d to -Plain(store_continuation(false)).

    -
  4. -
-
-
-
-
-

For efficiency, the rmem tool allows this transition only when it is -not possible to take the Commit and propagate store operation of an sc transition. This does not affect the set of -allowed final states, but when explored interactively, if the sc -should fail one should use the Early sc fail transition instead of waiting for this transition.

-
-
-
-
-
-
Complete store operations
-
-

A store instruction instance stem 6ac91b4e7dd35551c6ea477deba5f82d in state -Pending_mem_stores(store_continuation), for which all the memory store -operations in stem a9642b9ca23ba78275edb15d213c9b95 have been propagated, -can always be completed (not to be confused with finished). Action: -update the state of stem 6ac91b4e7dd35551c6ea477deba5f82d to -Plain(store_continuation(true)).

-
-
-
-
Satisfy, commit and propagate operations of an AMO
-
-

An AMO instruction instance stem 6ac91b4e7dd35551c6ea477deba5f82d in state -Pending_mem_loads(load_continuation) can perform its memory access if -it is possible to perform the following sequence of transitions with no -intervening transitions:

-
- -
-

and in addition, the condition of Finish instruction, with the exception of not requiring -stem 6ac91b4e7dd35551c6ea477deba5f82d to be in state Plain(Done), holds after those -transitions. Action: perform the above sequence of transitions (this -does not include Finish instruction), one after the other, with no intervening -transitions.

-
-
- - - - - -
- - -
-

Notice that program-order-previous stores cannot be forwarded to the -load of an AMO. This is simply because the sequence of transitions above -does not include the forwarding transition. But even if it did include -it, the sequence will fail when trying to do the Propagate store operation transition, as this -transition requires all program-order-previous store operations to -overlapping memory footprints to be propagated, and forwarding requires -the store operation to be unpropagated.

-
-
-

In addition, the store of an AMO cannot be forwarded to a -program-order-successor load. Before taking the transition above, the -store operation of the AMO does not have its value and therefore cannot -be forwarded; after taking the transition above the store operation is -propagated and therefore cannot be forwarded.

-
-
-
-
-
-
Commit fence
-
-

A fence instruction instance stem 6ac91b4e7dd35551c6ea477deba5f82d in state -Plain(Fence(kind, next_state)) can be committed if:

-
-
-
    -
  1. -

    if stem 6ac91b4e7dd35551c6ea477deba5f82d is a normal fence and it has .pr set, all -program-order-previous load instructions are finished;

    -
  2. -
  3. -

    if stem 6ac91b4e7dd35551c6ea477deba5f82d is a normal fence and it has .pw set, all -program-order-previous store instructions are finished; and

    -
  4. -
  5. -

    if stem 6ac91b4e7dd35551c6ea477deba5f82d is a fence.tso, all program-order-previous load -and store instructions are finished.

    -
  6. -
-
-
-

Action:

-
-
-
    -
  1. -

    record that stem 6ac91b4e7dd35551c6ea477deba5f82d is committed; and

    -
  2. -
  3. -

    update the state of stem 6ac91b4e7dd35551c6ea477deba5f82d to Plain(next_state).

    -
  4. -
-
-
-
-
Register read
-
-

An instruction instance stem 6ac91b4e7dd35551c6ea477deba5f82d in state -Plain(Read_reg(reg_name, read_cont)) can do a register read of -reg_name if every instruction instance that it needs to read from has -already performed the expected reg_name register write.

-
-
-

Let read_sources include, for each bit of reg_name, the write to -that bit by the most recent (in program order) instruction instance that -can write to that bit, if any. If there is no such instruction, the -source is the initial register value from initial_register_state. Let -reg_value be the value assembled from read_sources. Action:

-
-
-
    -
  1. -

    add reg_name to stem e4c7338ef90c20ab9f1e25877c063865 with -read_sources and reg_value; and

    -
  2. -
  3. -

    update the state of stem 6ac91b4e7dd35551c6ea477deba5f82d to Plain(read_cont(reg_value)).

    -
  4. -
-
-
-
-
Register write
-
-

An instruction instance stem 6ac91b4e7dd35551c6ea477deba5f82d in state -Plain(Write_reg(reg_name, reg_value, next_state)) can always do a -reg_name register write. Action:

-
-
-
    -
  1. -

    add reg_name to stem 106f9ebc134bdfd3291c4c84455ac2db with -stem 6dc98c92215c61f2018efb3df23af251 and reg_value; and

    -
  2. -
  3. -

    update the state of stem 6ac91b4e7dd35551c6ea477deba5f82d to Plain(next_state).

    -
  4. -
-
-
-

where stem 6dc98c92215c61f2018efb3df23af251 is a pair of the set of all read_sources from -stem e4c7338ef90c20ab9f1e25877c063865, and a flag that is true iff -stem 6ac91b4e7dd35551c6ea477deba5f82d is a load instruction instance that has already been -entirely satisfied.

-
-
-
-
Pseudocode internal step
-
-

An instruction instance stem 6ac91b4e7dd35551c6ea477deba5f82d in state -Plain(Internal(next_state)) can always do that pseudocode-internal -step. Action: update the state of stem 6ac91b4e7dd35551c6ea477deba5f82d to -Plain(next_state).

-
-
-
-
Finish instruction
-
-

A non-finished instruction instance stem 6ac91b4e7dd35551c6ea477deba5f82d in state Plain(Done) -can be finished if:

-
-
-
    -
  1. -

    if stem 6ac91b4e7dd35551c6ea477deba5f82d is a load instruction:

    -
    -
      -
    1. -

      all program-order-previous load-acquire instructions are finished;

      -
    2. -
    3. -

      all program-order-previous fence instructions with .sr set are -finished;

      -
    4. -
    5. -

      for every program-order-previous fence.tso instruction, -stem f9ab899994f3d644a9c2ab98a38de0c6, that is not finished, all load instructions that are -program-order-before stem f9ab899994f3d644a9c2ab98a38de0c6 are finished; and

      -
    6. -
    7. -

      it is guaranteed that the values read by the memory load operations -of stem 6ac91b4e7dd35551c6ea477deba5f82d will not cause coherence violations, i.e., for any -program-order-previous instruction instance stem fd12704256eca98f1c71f4fa1dde9dd8, let -stem 6863bf3b75317b35ee75c32d6a757974 be the combined footprint of propagated -memory store operations from store instructions program-order-between -stem 6ac91b4e7dd35551c6ea477deba5f82d and stem fd12704256eca98f1c71f4fa1dde9dd8, and fixed memory store -operations that were forwarded to stem 6ac91b4e7dd35551c6ea477deba5f82d from store -instructions program-order-between stem 6ac91b4e7dd35551c6ea477deba5f82d and stem fd12704256eca98f1c71f4fa1dde9dd8 -including stem fd12704256eca98f1c71f4fa1dde9dd8, and let -stem f07d30fa3303b30b878e98038c45b996 be the complement of -stem 6863bf3b75317b35ee75c32d6a757974 in the memory footprint of stem 6ac91b4e7dd35551c6ea477deba5f82d. -If stem f07d30fa3303b30b878e98038c45b996 is not empty:

      -
      -
        -
      1. -

        stem fd12704256eca98f1c71f4fa1dde9dd8 has a fully determined memory footprint;

        -
      2. -
      3. -

        stem fd12704256eca98f1c71f4fa1dde9dd8 has no unpropagated memory store operations that -overlap with stem f07d30fa3303b30b878e98038c45b996; and

        -
      4. -
      5. -

        if stem fd12704256eca98f1c71f4fa1dde9dd8 is a load with a memory footprint that overlaps -with stem f07d30fa3303b30b878e98038c45b996, then all the memory load -operations of stem fd12704256eca98f1c71f4fa1dde9dd8 that overlap with -stem f07d30fa3303b30b878e98038c45b996 are satisfied and stem fd12704256eca98f1c71f4fa1dde9dd8 -is non-restartable (see the Propagate store operation transition for how to determine if an -instruction is non-restartable).

        -
        -

        Here, a memory store operation is called fixed if the store instruction -has fully determined data.

        -
        -
      6. -
      -
      -
    8. -
    -
    -
  2. -
  3. -

    stem 6ac91b4e7dd35551c6ea477deba5f82d has fully determined data; and

    -
  4. -
  5. -

    if stem 6ac91b4e7dd35551c6ea477deba5f82d is not a fence, all program-order-previous -conditional branch and indirect jump instructions are finished.

    -
  6. -
-
-
-

Action:

-
-
-
    -
  1. -

    if stem 6ac91b4e7dd35551c6ea477deba5f82d is a conditional branch or indirect jump -instruction, discard any untaken paths of execution, i.e., remove all -instruction instances that are not reachable by the branch/jump taken in -instruction_tree; and

    -
  2. -
  3. -

    record the instruction as finished, i.e., set finished to true.

    -
  4. -
-
-
-
-
-

B.3.6. Limitations

-
-
    -
  • -

    The model covers user-level RV64I and RV64A. In particular, it does -not support the misaligned atomicity granule PMA or the total store -ordering extension "Ztso". It should be trivial to adapt the model to -RV32I/A and to the G, Q and C extensions, but we have never tried it. -This will involve, mostly, writing Sail code for the instructions, with -minimal, if any, changes to the concurrency model.

    -
  • -
  • -

    The model covers only normal memory accesses (it does not handle I/O -accesses).

    -
  • -
  • -

    The model does not cover TLB-related effects.

    -
  • -
  • -

    The model assumes the instruction memory is fixed. In particular, the -Fetch instruction transition does not generate memory load operations, and the shared -memory is not involved in the transition. Instead, the model depends on -an external oracle that provides an opcode when given a memory location.

    -
  • -
  • -

    The model does not cover exceptions, traps and interrupts.

    -
  • -
-
-
-
-
-
-
-

Appendix C: Vector Assembly Code Examples

-
-
-

CV32A65X: This appendix is not applicable as vector extension is -not supported.

-
-
-
-
-

Appendix D: Calling Convention for Vector State (Not authoritative - Placeholder Only)

-
-
-

CV32A65X: This appendix is not applicable as vector extension is -not supported.

-
-
-
-
-

Index

-
- -
-
-
-

Bibliography

-
-
-

RISC-V ELF psABI Specification. github.com/riscv/riscv-elf-psabi-doc/ .

-
-
-

RISC-V Assembly Programmer’s Manual. github.com/riscv/riscv-asm-manual .

-
-
-

IEEE Standard for a 32-bit microprocessor. (1994). IEEE Std. 1754-1994.

-
-
-

ANSI/IEEE Std 754-2008, IEEE standard for floating-point arithmetic. (2008). "Institute of Electrical and Electronic Engineers".

-
-
-

Amdahl, G. M., Blaauw, G. A., & F. P. Brooks, J. (1964). Architecture of the IBM System/360. IBM Journal of R. & D., 8(2).

-
-
-

Buchholz, W. (1962). Planning a computer system: Project Stretch. McGraw-Hill Book Company.

-
-
-

Heil, T. H., & Smith, J. E. (1996). Selective Dual Path Execution. University of Wisconsin - Madison.

-
-
-

Katevenis, M. G. H., Sherburne, R. W., Jr., Patterson, D. A., & Séquin, C. H. (1983, August). The RISC II micro-architecture. Proceedings VLSI 83 Conference.

-
-
-

Kim, H., Mutlu, O., Stark, J., & Patt, Y. N. (2005). Wish Branches: Combining Conditional Branching and Predication for Adaptive Predicated Execution. Proceedings of the 38th Annual IEEE/ACM International Symposium on Microarchitecture, 43–54.

-
-
-

Klauser, A., Austin, T., Grunwald, D., & Calder, B. (1998). Dynamic Hammock Predication for Non-Predicated Instruction Set Architectures. Proceedings of the 1998 International Conference on Parallel Architectures and Compilation Techniques.

-
-
-

Lee, D. D., Kong, S. I., Hill, M. D., Taylor, G. S., Hodges, D. A., Katz, R. H., & Patterson, D. A. (1989). A VLSI Chip Set for a Multiprocessor Workstation–Part I: An RISC Microprocessor with Coprocessor Interface and Support for Symbolic Processing. IEEE JSSC, 24(6), 1688–1698.

-
-
-

OpenCores. (2012). OpenRISC 1000 Architecture Manual, Architecture Version 1.0.

-
-
-

Pan, H., Hindman, B., & Asanović, K. (2009, March). Lithe: Enabling Efficient Composition of Parallel Libraries. Proceedings of the 1st USENIX Workshop on Hot Topics in Parallelism (HotPar ’09).

-
-
-

Pan, H., Hindman, B., & Asanović, K. (2010, June). Composing Parallel Software Efficiently with Lithe. 31st Conference on Programming Language Design and Implementation.

-
-
-

Patterson, D. A., & Séquin, C. H. (1981). RISC I: A Reduced Instruction Set VLSI Computer. ISCA, 443–458.

-
-
-

Sinharoy, B., Kalla, R., Starke, W. J., Le, H. Q., Cargnoni, R., Van Norstrand, J. A., Ronchetti, B. J., Stuecheli, J., Leenstra, J., Guthrie, G. L., Nguyen, D. Q., Blaner, B., Marino, C. F., Retter, E., & Williams, P. (2011). IBM POWER7 multicore server processor. IBM Journal of Research and Development, 55(3), 1–1.

-
-
-

Thornton, J. E. (1965). Parallel Operation in the Control Data 6600. Proceedings of the October 27-29, 1964, Fall Joint Computer Conference, Part II: Very High Speed Computer Systems, 33–40.

-
-
-

Tseng, J., & Asanović, K. (2000). Energy-Efficient Register Access. Proc. of the 13th Symposium on Integrated Circuits and Systems Design, 377–384.

-
-
-

Ungar, D., Blau, R., Foley, P., Samples, D., & Patterson, D. (1984). Architecture of SOAR: Smalltalk on a RISC. ISCA, 188–197.

-
-
-

Waterman, A. (2011). Improving Energy Efficiency and Reducing Code Size with RISC-V Compressed (Issue UCB/EECS-2011-63) [Master’s thesis]. University of California, Berkeley.

-
-
-

Waterman, A. (2016). Design of the RISC-V Instruction Set Architecture (Issue UCB/EECS-2016-1) [PhD thesis]. University of California, Berkeley.

-
-
-

Waterman, A., Lee, Y., Patterson, D. A., & Asanović, K. (2011). The RISC-V Instruction Set Manual, Volume I: Base User-Level ISA (UCB/EECS-2011-62; Issue UCB/EECS-2011-62). EECS Department, University of California, Berkeley.

-
-
-

Waterman, A., Lee, Y., Patterson, D. A., & Asanović, K. (2014). The RISC-V Instruction Set Manual, Volume I: Base User-Level ISA Version 2.0 (UCB/EECS-2014-54; Issue UCB/EECS-2014-54). EECS Department, University of California, Berkeley.

-
-
-
-
- - - \ No newline at end of file diff --git a/docs/04_cv32a65x/tristan/README.md b/docs/04_cv32a65x/tristan/README.md new file mode 100644 index 0000000000..9b7111edc7 --- /dev/null +++ b/docs/04_cv32a65x/tristan/README.md @@ -0,0 +1,6 @@ +This directory contains documents written for Tristan project. + +- [Verification Specifications](./verif-spec/verification_specifications.adoc) +This document describes the CVA6 verification strategy and implementation. +- [Tandem-Based Verification](./tandem-verification/tandem.adoc) +This document describes the CVA6 tandem (lockstep) verification infrastructure and contains associated User and Reference Manuals. diff --git a/docs/04_cv32a65x/tristan/tandem-verification/figures/Spike-simulation-scope.svg b/docs/04_cv32a65x/tristan/tandem-verification/figures/Spike-simulation-scope.svg new file mode 100644 index 0000000000..1e6e8103a6 --- /dev/null +++ b/docs/04_cv32a65x/tristan/tandem-verification/figures/Spike-simulation-scope.svg @@ -0,0 +1 @@ + \ No newline at end of file diff --git a/docs/04_cv32a65x/tristan/tandem-verification/figures/overall-tandem-architecture.svg b/docs/04_cv32a65x/tristan/tandem-verification/figures/overall-tandem-architecture.svg new file mode 100644 index 0000000000..9461433a86 --- /dev/null +++ b/docs/04_cv32a65x/tristan/tandem-verification/figures/overall-tandem-architecture.svg @@ -0,0 +1 @@ +RVFI compareEqual?Report mismatchClockcycleCommit or trapDUTRVFI interfaceNoYesSystemVerilogInjectinterruptsStepsimulationInject«previous» CSR valuesEmit RVFI outputC++RVFI + DPIRVFI+DPIvalid=1 \ No newline at end of file diff --git a/docs/04_cv32a65x/tristan/tandem-verification/figures/refmodel-step-stages.svg b/docs/04_cv32a65x/tristan/tandem-verification/figures/refmodel-step-stages.svg new file mode 100644 index 0000000000..9e26f6ddcd --- /dev/null +++ b/docs/04_cv32a65x/tristan/tandem-verification/figures/refmodel-step-stages.svg @@ -0,0 +1 @@ + \ No newline at end of file diff --git 
a/docs/04_cv32a65x/tristan/tandem-verification/reference.adoc b/docs/04_cv32a65x/tristan/tandem-verification/reference.adoc new file mode 100644 index 0000000000..35109f8772 --- /dev/null +++ b/docs/04_cv32a65x/tristan/tandem-verification/reference.adoc @@ -0,0 +1,544 @@ +== Programming Reference Manual + +=== Configuration parameter names and scopes + +Parameters are referenced by their hierarchical names of the form `PREFIX/PARAM_NAME` that enable a tree-like hierarchical structure. Supported values of `PREFIX` are: + +* `/top`: parameter values apply to the complete platform (which may consist of several cores.) +* `/top/cores`: parameter values apply to all harts, and can be overridden on a core-by-core basis using core-specific values. +* `/top/core/`: parameter values apply only to the core with ID ``. + +Parameters are explicitly typed. Three parameter types are supported: + +* `bool`: Boolean type with two values: `True` (or integer `1`) and `False` (or integer `0`). +* `uint64_t`: A 64-bit unsigned integer, represented using any of the formats supported in the given context (SystemVerilog or C/C++). +* `string`: A string, appropriately quoted depending on use context (SystemVerilog, C++ source code or Spike command line in Unix shell). + +Parameter names are not constrained by the C/C++/SystemVerilog identifier syntax. However, they cannot contain spaces, forward slash (`/`), backslash (`\`) nor double-quote (`"`) characters. + +Some parameters are only meaningful in specific scopes: either platform (`/top/`), or core (`/top/cores/` or `/top/core//`). 
+ +=== Parameter index + +==== Platform-level parameters + +[cols="1,1,1,5"] +|=== +|Name |Type |Default |Description + +|`bootrom` +|bool +|`True` +|`True` if platform contains a boot ROM + +|`bootrom_base` +|uint64_t +|`0x10000` +|Base address of boot ROM (unused if boot ROM not present) + +|`bootrom_size` +|uint64_t +|`0x1000` +|Size of boot ROM (unused if boot ROM not present) + +|`dram_base` +|uint64_t +|`0x80000000` +|Base address of the DRAM + +|`dram_enable` +|bool +|`True` +|Presence of the DRAM + +|`dram_size` +|uint64_t +|`0x40000000` +|Size of the DRAM + +|`generic_core_config` +|bool +|`True` +|Use a common core configuration for all cores. + +|`isa` +|string +|`"RV64IMAFDC_zicntr_zihpm"` +|Default ISA string including any standard extensions, in canonical form. + +|`log_commits` +|bool +|`True` +|If `True`, generate Spike log of instruction commits and exceptions. + +|`max_steps` +|uint64_t +|`2000000UL` +|Maximum number of steps (instruction commits and trap entries) before terminating the ISS simulation if `max_steps_enabled` is set to `True`. + +|`max_steps_enabled` +|bool +|`False` +|If `True`, obey the limit set in `max_steps`. + +|`num_procs` +|uint64_t +|`1` +|Number of cores (harts) in the platform. +|=== + +==== Per-CSR parameters + +All CSRs of cores in the platform can be controlled with respect to their presence ("availability") using a per-CSR parameter. For each CSR that is available, per-CSR parameters control the initial value, the writable bits and the overall writability of the register. + +Depending on the parameter prefix used (`/top/core//` or `/top/cores/`), the parameters will apply to the CSR of a specific core or they will set a default behavior for the given CSR in all cores. + +The supported per-CSR parameters are listed in the table below. `` stands for any valid CSR name recognized by the reference model simulator, in lowercase (as per RISC-V specification). 
+ +[cols="1,1,1,5"] +|=== +|Name |Type |Default |Description + +|`_accessible` +|bool +|`True` +|`True` if register `` should be accessible in machine mode. Value `False` will cause an "illegal instruction" exception to be raised on any access to the register. + +|`_implemented` +|bool +|`True` +|`True` if register `` is implemented. Value `False` will cause any writes to this register to be discarded. + +|`_override_mask` +|uint64_t +|`0x0` +|Mask of bits that should be replaced in the default reset value of ``. + +|`_override_value` +|uint64_t +|`0x0` +|Value to use as a replacement of the default reset value of ``. + +|`_write_mask` +|uint64_t +|`~0x0` (all-ones) +|Mask of writable bits in register ``. Bits which are set allow writing into corresponding bits of ``. +|=== + +=== Core-level parameters + +Depending on the prefix used (`/top/cores/` or `/top/core/`) the core-level parameters form a default for all cores or apply to a specific core with ID ``. + +[cols="1,1,1,5"] +|=== +|Name |Type |Default |Description + +|`boot_addr` +|uint64_t +|`0x10000` +|Boot address for core(s). + +|`csr_counters_injection` +|bool +|`False` +|If `True`, inject values of microarchitecture-dependent counter CSRs from RTL into the reference model. Affected CSRs are `cycle`, `cycleh`, `mcycle`, `mcycleh` and `mip`. + +|`debug_exception_handler_addr` +|uint64_t +|`0x1a140000` +|Address of exception handler for exceptions occurring in debug mode + +|`debug_handler_addr` +|uint64_t +|`0x1a110800` +|Address of the debug handler + +|`debug_injection` +|bool +|`True` +|If `True`, inject debug events from RTL. + +|`extensions` +|string +|`""` +|Comma-separated list of Spike extensions to load. + +Extensions currently supported: + +- `cvxif`: implements the CV-X-IF interface; + +- `cv32a60x`: implements CSRs specific to the `CV32A6*X` cores. + +|`hide_csrs_based_on_priv` +|bool +|`False` +|Allow to mark CSRs as "not available" based on the privilege modes available. 
By default, Spike makes User and Supervisor mode registers accessible, even if Machine privilege level is the only level implemented. + +|`interrupts_injection` +|bool +|`True` +|If `True`, inject interrupts according to value passed from the RTL simulation in `mcause` CSR into the reference model. + +|`isa` +|string +|`"RV32GC"` +|ISA string including any standard extensions, in canonical form. + +|`marchid` +|uint64_t +|`0x3` +|Value to be returned when reading the read-only CSR `marchid`. + +|`mhartid` +|uint64_t +|`0x0` +|Value to be returned when reading the read-only CSR `mhartid`. + +|`misaligned` +|bool +|`False` +|If `True`, support misaligned memory accesses. + +|`mmu_mode` +|string +|`"sv39"` +|MMU mode of the core(s). + +|`mtvec_vectored_alignment` +|uint64_t +|`0x4` +|Default alignment of `mtvec` when using trap vector in vectored mode. + +|`mvendorid` +|uint64_t +|`0x00000602` +|Value to be returned when reading the read-only CSR `mvendorid`. + +|`nmi_mcause` +|uint64_t +|`0x00000020` +|Value of `mcause` which represents a non-maskable interrupt (NMI), without the interrupt bit. Meaningful only if `interrupts_injection` == `True`. + +|`non_standard_interrupts` +|bool +|`False` +|Support non-standard interrupt notification scheme in which arbitrary patterns can be written into the lower 16 bits of `mie` and `mip` CSRs. + +|`override_custom_extensions` +|bool +|`True` +|Allow to override the presence of custom ISS extensions (see also `extensions` parameter above.) + +|`override_custom_extensions_value` +|bool +|`False` +|If `False`, pretend no custom extensions were specified. Used to disable all custom extensions at once. + +|`pmmpaddr0` +|uint64_t +|`0x0` +|Default value of `pmpaddr0` CSR + +|`pmpcfg0` +|uint64_t +|`0x0` +|Default value of `pmpcfg0` CSR + +|`pmpregions_max` +|uint64_t +|`0x0` +|Total count of implemented PMP regions in the core(s). The set of legal values is defined in RISC-V Privileged ISA specification: 0, 16 and 64. 
+ +|`pmpregions_writable` +|uint64_t +|`0x0` +|Number of PMP regions with modifiable properties in the core(s), starting from region with index 0. + +|`priv` +|string +|`"MSU"` +|Supported privilege levels: "M", "MU" or "MSU" + +|trigger_count +|uint64_t +|`0x4` +|Number of supported triggers + +|unified_traps +|bool +|`False` +|Process all kinds of traps synchronously as if they were exceptions. +|=== + + +=== DPI Programming Interface + +The DPI interface is implemented in file `vendor/riscv/riscv-isa-sim/riscv/riscv_dpi.cc` in `core-v-verif` repository (https://github.com/openhwgroup/core-v-verif/). + +The shared library providing the implementation of the DPI functions to RTL simulators is named `libriscv.so`. It is built during Spike build and is installed in the `lib` subdirectory of Spike installation directory, typically `/tools/spike/lib`. + + +==== Create a new instance of Spike extended with DPI interfaces suitable for tandem operation + +[,verilog] +---- +import "DPI-C" function void spike_create(string filename); +---- + +[,c++] +---- +extern "C" void spike_create(const char *filename); +---- + +Input args: + +* `filename`: name of the ELF file to load into Spike memory. + +Return value: NONE. + +The instance of Spike will be available in global variable `Simulation *sim`. + +=== Destroy an existing instance of Spike with DPI interfaces + +[,verilog] +---- +import "DPI-C" function void spike_delete(); +---- + +[,c++] +---- +extern "C" void spike_delete(); +---- + +Input args: NONE. + +Return value: NONE. + +Deletes the instance of Spike pointed to by global variable `Simulation *sim`. 
+ +=== Step Spike execution through the DPI interface with RVFI state represented as vectors of logic signals + +[,verilog] +---- +import "DPI-C" function void spike_step_svLogic(inout vector_rvfi core, inout vector_rvfi reference_model); +---- + +[,c++] +---- +extern "C" void spike_step_svLogic(svLogicVecVal* reference, svLogicVecVal* spike); +---- + +Input args: + +* `core`: (mutable) RVFI state of the RTL core represented as signal vector. +* `reference_model`: (mutable) RVFI state of the reference model represented as signal vector. + +Return value: NONE. + +Before stepping, the entry state of the reference model will be adjusted according to the state of the RTL model. After the step completes, the return state will represent the result of stepping the reference model for one commit (or exception) from the *updated* input state. + +==== Set Boolean parameter `` to `` + +[,verilog] +---- +import "DPI-C" function void spike_set_param_bool(string base, string name, bit value); +---- + +[,c++] +---- +extern "C" void spike_set_param_bool(const char *base, const char *name, bool value); +---- + +Input args: + +* `base`: Base part of parameter name (up to last forward slash character, inclusive) +* `name`: Last member of parameter name (past last forward slash character) +* `value`: Truth value representable on a single bit. + +Return value: NONE. + +==== Set uint64_t parameter `` to `` + +[,verilog] +---- +import "DPI-C" function void spike_set_param_uint64_t(string base, string name, longint unsigned value); +---- + +[,c] +---- +extern "C" void spike_set_param_uint64_t(const char *base, const char *name, uint64_t value); +---- + +Input args: + +* `base`: Base part of parameter name (up to last forward slash character, inclusive) +* `name`: Last member of parameter name (past last forward slash character) +* `value`: 64-bit unsigned integer value. + +Return value: NONE. 
+ +==== Set string parameter `` to `` + +[,verilog] +---- +import "DPI-C" function void spike_set_param_str(string base, string name, string value); +---- + +[,c++] +---- +extern "C" void spike_set_param_str(const char *base, const char *name, string value); +---- + +Input args: + +* `base`: Base part of parameter name (up to last forward slash character, inclusive) +* `name`: Last member of parameter name (past last forward slash character) +* `value`: String value. + +Return value: NONE. + +==== Return the value of Boolean parameter `` + +[,verilog] +---- +import "DPI-C" function bit spike_get_param_bool(string base, string name); +---- + +[,c++] +---- +extern "C" bit spike_get_param_bool(const char *base, const char *name); +---- + +Input args: + +* `base`: Base part of parameter name (up to last forward slash character, inclusive) +* `name`: Last member of parameter name (past last forward slash character) + +Return value: Current value of Boolean parameter ``. + +==== Return the value of uint64_t parameter `` + +[,verilog] +---- +import "DPI-C" function longint unsigned spike_get_param_uint64_t(string base, string name); +---- + +[,c++] +---- +extern "C" uint64_t spike_get_param_uint64_t(const char *base, const char *name); +---- + +Input args: + +* `base`: Base part of parameter name (up to last forward slash character, inclusive) +* `name`: Last member of parameter name (past last forward slash character) + +Return value: Current value of uint64_t parameter `` + +==== Return the value of string parameter `` + +[,verilog] +---- +import "DPI-C" function string spike_get_param_str(string base, string name); +---- + +[,c++] +---- +extern "C" string spike_get_param_str(const char *base, const char *name); +---- + +Input args: + +* `base`: Base part of parameter name (up to last forward slash character, inclusive) +* `name`: Last member of parameter name (past last forward slash character) + +Return value: Current value of string parameter ``. 
+ +==== Set a collection of parameters from a Yaml configuration file + +[,verilog] +---- +import "DPI-C" function void spike_set_params_from_file(string paramFilePath); +---- + +[,c++] +---- +extern "C" void spike_set_params_from_file(const char *paramFilePath); +---- + +Sets parameters of the current Spike instance according to the content of file named ``. + +==== Get current value of a Spike CSR of a specific core + +[,verilog] +---- +import "DPI-C" function void spike_get_csr(input longint unsigned proc_id, input longint unsigned csr_addr, inout longint unsigned value); +---- + +[,c++] +---- +extern "C" void spike_get_csr(uint64_t proc_id, uint64_t csr_addr, uint64_t &value); +---- + +Input args: + +* `proc_id`: ID of the core to be queried +* `csr_addr`: Address of the CSR +* `value`: Upon return from the function holds the value of the requested CSR. + +Return value: NONE (CSR value is passed in an in-out arg). + +==== Set value of a Spike CSR for a specific core + +[,verilog] +---- +import "DPI-C" function void spike_put_csr(input longint unsigned proc_id, input longint unsigned csr_addr, input longint unsigned value); +---- + +[,c++] +---- +extern "C" void spike_put_csr(uint64_t proc_id, uint64_t csr_addr, uint64_t value); +---- + +Input args: + +* `proc_id`: ID of the core to be queried +* `csr_addr`: Address of the CSR +* `value`: Value to be placed in the requested CSR. + +Return value: NONE. + + +//=== C++ Programming Interface + +=== Command-line interface + +Command-line options added to Spike serve a dual purpose: + +* run Spike in standalone mode using the same configurations as in tandem mode; +* (development aid) test the effect of individual parameters without editing configuration files. + +==== Load Spike parameters from a configuration file + +[,sh] +---- +--param-file +---- + +Load the parameter tree from Yaml file named ``. + +The file is expected to contain a valid parameter tree. 
Parameters which are not recognized will be silently ignored. + +==== Set a single Spike parameter + +[,sh] +---- +--param <name>:<type>=<value> +---- + +Set parameter `<name>` of type `<type>` to value `<value>`. + +`<name>` must be a full hierarchical parameter name starting with `/top/`. + +`<type>` must be one of `bool`, `uint64_t` or `string`. + +`<value>`: Boolean values can be represented as numbers `0` and `1`, or symbols `True`, `true`, `False` or `false`. + +Integer values will be converted automatically according to the C/C++ notation rules: numbers starting with `0x` are treated as hexadecimal, numbers starting with a leading `0` followed by digits are treated as octal, and all other numbers are assumed to be in decimal notation. diff --git a/docs/04_cv32a65x/tristan/tandem-verification/tandem.adoc b/docs/04_cv32a65x/tristan/tandem-verification/tandem.adoc new file mode 100644 index 0000000000..46a139f5d2 --- /dev/null +++ b/docs/04_cv32a65x/tristan/tandem-verification/tandem.adoc @@ -0,0 +1,321 @@ += Tandem-based verification +:toc: + +This document describes the OpenHW Group functional verification environment based on lockstep ("tandem") simulation of a RISC-V reference model and an RTL implementation of a RISC-V core. + +It is divided into two parts: + +* a User Manual which discusses the principle, the architecture and the usage of the environment, and outlines future work; +* a Reference Manual which provides a reference of configuration parameters and programming interfaces to support hardware designers, software developers, and verification engineers. + +== User Manual + +=== Scope of verification + +The verification is performed on the CVA6 core, with the exclusion of interrupt controllers, platform-related mechanisms (real-time timer), peripherals etc.
+ +The behaviors that are verified are: + +* instruction completion +* general-purpose register value changes +* CSR value changes +* memory operations (as seen by the CVA6 core) +* raising of exceptions +* raising of interrupts +* trap entry and return + +=== Basic Architecture and Design Choices + +Tandem-based functional verification of an RTL design consists in a lockstep comparison of the behavior of an RTL implementation against the behavior of a reference model (usually an Instruction Set Simulator, or ISS), both executing the same programs from the same initial state. In opposition to post-execution trace comparison, the tandem approach enables immediate detection of differences in observable behavior of the RTL implementation and the reference model, thus greatly simplifying the investigation of the root causes of any discrepancies. + +The OpenHW Group tandem verification environment relies on the use of RVFI (RISC-V Formal Interface, https://github.com/SymbioticEDA/riscv-formal/blob/master/docs/rvfi.md). RVFI provides a well-defined abstraction of design state at each instruction commit or exception occurrence. + +RVFI events are collected from both RTL and reference model side, and the RVFI states of both simulations are compared after every step. + +The reference model is a slave to the RTL model; it performs an execution step only when the RTL model has completed the current step. This enables partial verification of behaviors involving values that depend on micro-architecture features, e.g., clock cycle counts. The RTL provides the actual values of such registers to the reference model; these values are trusted by the reference model and hence, the reads of CSR registers yield the same results in the reference model as they do in the RTL simulation. + +The RTL simulation environment provides the testbench and the required abstractions of platform-level resources and behaviors. 
To match the features of the RTL environment, the reference model needs to include a basic platform abstraction that at the very minimum contains a memory model. For this reason Spike is a prime choice as it contains a superset of the minimum platform needed, cf. xref:fig-spike-platform[xrefstyle=short]. + +.High-level view of the platform supported by Spike +[#fig-spike-platform] +image::figures/Spike-simulation-scope.svg[High-level view of Spike platform,1024] + +In order to ease the integration with RTL simulators, the tandem infrastructure does not depend on the availability of a fully-featured UVM implementation. Instead, it uses a limited subset of UVM to provide a uniform SystemVerilog interface across all supported RTL simulators, whether commercial or open-source. + +==== Operation + +The principle of operation of CVA6 tandem verification is shown in xref:fig-overall-tandem-flow[xrefstyle=short]. The execution of the tandem simulation is controlled by the main RTL simulation loop that is managed either by the "run" phase of the UVM testbench, or explicitly in a test harness. The RTL simulation runs through successive clock cycles until the RVFI state of the RTL model is marked as valid, indicating an instruction commit or an exception in the current RTL clock cycle. + +.Overall tandem verification flow +[#fig-overall-tandem-flow] +image::figures/overall-tandem-architecture.svg[Tandem verification flow,1024] + +In the CVA6 RTL implementation multiple commits or exceptions can occur in a single cycle. If multiple simultaneous events are reported by RVFI, the corresponding entries in the RFVI interface are always ordered in the sequential execution order. For each commit or exception reported by the RVFI interface of the RTL model, the tandem control loop performs a single step of the reference model and compares the RVFI state between the current RTL-side RVFI entry and the reference model RVFI output. 
Per-commit entries in the RVFI interface are ordered in the sequential execution order. + +Interrupt and debug events detected by the RTL model are injected into the reference model at the beginning of each step. Likewise, the most recent *previously* committed values of microarchitecture-dependent CSRs such as clock cycle counters (`cycle`/`cycleh`, `mcycle`/`mcycleh`) are injected before the reference model step since the RTL values of these registers already correspond to RTL state *after* the corresponding RTL event. + +//.Stages of a single reference model step +//[#fig-refmodel-step] +//image::figures/refmodel-step-stages.svg[Stages of a reference model step,800,opts=inline] + +If a discrepancy in RVFI state is detected, an appropriate human-readable error output is written into the log file and the error count of the simulation is incremented. The tandem simulation stops once a preset number of errors is encountered. By controlling the error count limit it is possible to accommodate transient differences between the RTL and reference model states. A typical example of transient difference occurs when two instructions are committed in a single cycle by a superscalar RTL implementation, causing the counter of retired instructions `minstret` to be atomically incremented by two whereas the reference model will always commit instructions one by one, thus incrementing `minstret` by one each time. + +At the end of a tandem simulation a Yaml report file is produced. It contains information about the test setup (architecture, ISA, test name, etc.) and the list of mismatches found if any. + +For the purpose of CVA6 core verification the core-level interrupt controller (CLIC) is **not** modeled; instead, interrupt management is performed by the testbench. The RTL core model receives interrupt notifications from the testbench through the available interrupt input ports.
When the RTL model of the core detects an interrupt, the information about the interrupt is passed to the reference model by reconstructing the value of the `mip` (Machine Interrupt Pending) CSR from the value of the `mcause` register to ensure that the interrupt taken is the same as the one already taken by the RTL simulation. The actual RTL value of the `mip` register may contain additional bits which are set; it will be injected into the reference model state at the end of the reference model step. + +==== Implementation + +The RTL testbenches use DPI functions to control Spike stepping, inject CSR values and query Spike's RVFI interface. + +The selection of single/tandem operation mode is achieved using the SystemVerilog macro `SPIKE_TANDEM`. If defined to a non-zero ("true") value, it enables the tandem configuration in the RTL simulation by instantiating and connecting the Spike wrapper, and activates the stepping and RVFI compare functions. + +The locations of the Yaml configuration file and of the Yaml report file are selected using dedicated "plusarg" command-line options added to the testbenches: + +* `+config_file=<filename>` sets the name of the Yaml configuration file to `<filename>`; +* `+report_file=<filename>` sets the name of the Yaml tandem report file to `<filename>`. + +Option `+config_file` should only be used when an appropriate configuration file exists, and should be omitted otherwise. + +SystemVerilog code of the tandem infrastructure is maintained in OpenHW Group repository `core-v-verif` (https://github.com/openhwgroup/core-v-verif) which also contains the UVM components and agents that leverage the pure SystemVerilog layer. + +A pure SystemVerilog tandem testbench for CVA6 is available in the OpenHW Group CVA6 repository (https://github.com/openhwgroup/cva6/).
+ +To limit the impact on the Spike original code base, Spike modifications rely on "shim" (adapter) classes that add extra functionality in two ways: + +* by adding a new base class below the original base class (used for CSRs) +* by specializing a class to provide additional interfaces (Processor and Simulation) + +Spike execution model was modified to run a single-instruction step function instead of the performance-optimized "instruction batch" mode that executes hundreds to thousands of instructions without yielding control to the environment of the simulated design. + +In order to provide fine-grained control over the configuration of the simulated design, Spike was extended with a parameter mechanism with the following characteristics: + +* multiple ways of controlling Spike simulator behavior: +** configuration files (Yaml) +** DPI API +** C++ API +** additional command-line options +* ability to set platform-related values (memory map, boot addresses) +* ability to set implementation-defined values (number of PMP regions in total, vendor ID, architecture ID, implementation ID, ...) +* extended CSR control: +** availability or not of any CSR +** CSR initialization masks and values +** CSR write-enable masks with bit granularity +* ability to inject CSR values from the RTL model for microarchitecture-dependent CSRs (`cycle`/`cycleh`, `mcycle`/`mcycleh`, `mip` etc.) + +Spike modifications are maintained in a forked Spike tree integrated into the OpenHW `core-v-verif` repository (https://github.com/openhwgroup/core-v-verif/). + +=== Fundamental limitations of the tandem verification approach + +* By definition, a reference model based on an instruction set simulator has only the notion of instruction commits and is not capable of representing the behaviors between commits (transient signals, micro-architectural artefacts such as stalls etc.) ++ +Such behaviors must be verified by other means.
+ +* Simultaneous occurrence of the same error in both reference model and RTL implementation will not be detected since the two behaviors will match. ++ +This stresses the importance of validation of the reference model against the specification. + +* Since the reference model for tandem verification does not include an interrupt controller, the presence of pending interrupts must be notified to the `mip` register by other means. ++ +In the OpenHW Group solution, the presence of the interrupt is detected by the RTL model and the information about the actual interrupt raised as the result of interrupt arbitration is passed to the reference model in the `mcause` register. The value in `mcause` is then used to trigger the corresponding event in the reference model. + +* Transient discrepancies can be caused by differences between purely sequential and concurrent model execution. ++ +Spike executes instructions in sequence whereas a superscalar architecture may commit multiple instructions in a single cycle; this means that some CSR updates may differ between Spike and RTL, typically MINSTRET may advance by 2 or more in the RTL model, yet Spike will increment it by 1 at every instruction committed. + +=== QuickStart Guide + +==== Build instructions + +The standard build procedure of Spike supplied with the CVA6 project (https://github.com/openhwgroup/cva6/) builds a Spike binary with all necessary modifications required for DPI interfacing to the RTL testbench. + +Building Spike with tandem extensions requires the availability of `CMake` in version 3.16 or higher. The `yaml-cpp` source package (https://github.com/jbeder/yaml-cpp) that is used to parse and manipulate Yaml configurations is a submodule of Spike and will be downloaded when performing a recursive clone or update of the CVA6 repository. 
+ +If a rebuild is required in an existing setup, it is necessary to either select a different installation directory for Spike or to remove the current installation directory of Spike. It is also necessary to remove Spike build directories `verif/core-v-verif/vendor/riscv/riscv-isa-sim/build` (Spike proper) and `verif/core-v-verif/vendor/riscv/riscv-isa-sim/yaml-cpp/build` (`yaml-cpp` library). + +==== Understanding the Spike configuration file + +Spike configuration files represent a Spike parameter tree in the form of nested Yaml dictionaries under a common root node `spike_param_tree`: + +[,yaml] +---- + spike_param_tree: + bootrom_enable: true + bootrom_base: 0x10000 + bootrom_size: 0x1000 + dram_enable: true + dram_base: 0x80000000 + dram_size: 0x40000000 + core_configs: + - + isa: rv32imczicsr_zcb_zba_zbb_zbc_zbs + priv: M + extensions: cv32a60x,cvxif + boot_addr: 0x80000000 + marchid_override_mask: 0xFFFFFFFF + marchid_override_value: 0x3 + misa_write_mask: 0x0 +---- + +The first-level entries define platform properties (`bootrom_*` and `dram_*`) and the default settings for all cores of the system (`core_configs`). + +The configuration fragment above instructs the reference model that: + +* the platform contains a boot ROM of size 4 KiB starting at address 0x10000 (0+64 KiB); +* the platform contains a DRAM memory of 1 GiB starting at the address 0x80000000 (0+2 GiB) +* all cores: +** implement an RV32IMC ISA with extensions `Zicsr`, `Zcb`, `Zba`, `Zbb`, `Zbc` and `Zbs`; +** support only the Machine privilege level; +** implement additional features modeled in Spike custom extensions `cv32a60x` (additional CSRs specific to `CV32A6*X` family of cores) and `cvxif` (the CV-X-IF interface); +** boot from address `0x80000000` (the start address of the DRAM memory); +** force the reset value of register `marchid` to value `0x3` (corresponding to the CVA6 architecture); +** discard writes into `misa` CSR register by marking all its bits as non-mutable. 
+ +==== Running simulations in tandem mode + +Currently, tandem simulations are supported out-of-the-box only for CVA6 target `cv32a65x` using RTL simulator configurations `vcs-uvm` (recommended), `vcs-testharness`, `questa-uvm` and `questa-testharness`. Tandem simulations of target `cv32a65x` with Verilator using the `veri-testharness` configuration require Verilator v5.016 or higher. + +Tandem verification is enabled by setting environment variable `SPIKE_TANDEM` to a non-empty value prior to invoking any of the test scripts located in the `verif/regress` directory of the CVA6 source tree. + +**Example:** +[,sh] +---- +export SPIKE_TANDEM=1 ; bash verif/regress/dv-csr-embedded-tests.sh +---- + +The output of tandem simulation is sent to the log file of the RTL simulator, by default `verif/sim/out_<date>/<simulator>_sim/<test>.<target>.log.iss`. + +A machine-processable Yaml report file summarizing the mismatches detected during the simulation is stored in a separate report file, by default `verif/sim/out_<date>/<simulator>_sim/<test>.<target>.log.yaml`. It contains the configuration of the simulated design, the number of mismatches that were found (if any) and a list of entries describing the successive mismatches. + +=== How to analyze tandem verification results + +To simplify failure analysis, the RVFI state comparator reports mismatches between commit states of the reference model and the RTL model at all verbosity levels. However, it produces no output if the commit states are identical and the verbosity level is `UVM_LOW`. By controlling output verbosity, the simulation log can be reduced to the sole list of state discrepancies, or it can contain a complete log of commits and exceptions to provide additional context at failure locations. + +A scoreboard discrepancy is represented by a line describing the nature of the error, followed by two lines that provide a summary view of the RVFI state of the reference model and the RTL core.
+ +**Example:** +[,text] +---- +CSR 304 Mismatch [REF]: 0x0 [CORE]: 0x8 +UVM_INFO @ 45992.000 ns : uvmc_rvfi_scoreboard_utils.sv(206) reporter [spike_tandem] 45992.000 ns | RVFI | 0 | 0 | 800036ac | 30419073 | M | x3 | 00000008 | x0 | 0000000000000000 | csrrw zero, mie, gp +UVM_ERROR @ 45992.000 ns : uvmc_rvfi_scoreboard_utils.sv(211) reporter [spike_tandem] 45992.000 ns | RVFI | 0 | 0 | 800036ac | 30419073 | M | x3 | 00000008 | x0 | 0000000000000000 | csrrw zero, mie, gp <- CORE +---- + +The first line indicates that for current instruction commit the content of the CSR with hexadecimal address `304`, i.e., `mie` ("Machine Interrupt Enable") differs between the reference model (value `0x0`) and the RTL core model (value `0x8`). The next two lines provide details about the current instruction in both models: + +* the current time stamp: `45992.000 ns`, +* the value of PC: `0x800036ac`, +* the encoding of the instruction: hex `30419073`, +* the current privilege mode: `M` (Machine), +* the values of general-purpose register operands of the instruction: `x3` (`gp` in RISC-V ABI) equals `0x00000008`, `x0` equals `0x00000000`. + +Since the values stored into the `mie` CSR differ whereas the instruction executed is the same including the values of its input operands, the discrepancy can only come from the write behavior of the `mie` CSR. In this specific case, bit 3 of the `mie` register is specified as "read-only zero" in the reference model, yet the RTL implementation accepts writes into this specific bit of `mie`. 
+ +The following is the corresponding fragment from the Yaml report, including the configuration and status summary of the simulation run: + +[,yaml] +---- +csrs_match_count: 461 +exit_cause: MISMATCH +exit_code: 0 +instr_count: 3280 +isa: rv32imc_zba_zbb_zbs_zbc_zicsr_zifencei +mismatch_description: 'CSR 304 Mismatch [REF]: 0x880 [CORE]: 0x888 ' +mismatches: +- 0: null + core: + insn: 0000000030419073 + insn_disasm: csrrw zero, mie, gp + mode: 3 + pc_rdata: 00000000800036ac + pc_wdata: 0 + rd1_addr: 0 + rd1_rdata: 0 + rs1_addr: 3 + rs1_rdata: 0000000000000008 + trap: 0 + reference_model: + insn: 0000000030419073 + insn_disasm: csrrw zero, mie, gp + mode: 3 + pc_rdata: 00000000800036ac + pc_wdata: 0 + rd1_addr: 0 + rd1_rdata: 0 + rs1_addr: 3 + rs1_rdata: 0000000000000008 + trap: 0 +[...] +mismatches_count: 5 +simulator: vcs-uvm +target: cv32a65x +test: csr_test +testlist: csr_embedded +---- + +=== Example uses of Yaml reference model parameters + +==== Enable (make "accessible") or disable (make "inaccessible") a CSR + +A Boolean parameter consisting of CSR name and suffix `_accessible`, placed in generic or per-core configuration, indicates whether the given register should be accessible or not. + +**Example**: + +[,yaml] +---- +spike_param_tree: + ... + core_configs: + - + ... + tinfo_accessible: False +---- + +makes the `tinfo` CSR inaccessible in the reference model for all cores, meaning that any attempt to access (read or write) the `tinfo` register when executing an instruction in the reference model will trigger an illegal instruction exception. + +==== Force specific bits in a CSR to a constant value + +The reference model parameters provide two levels of control over the content of CSRs: + +* override the reset value of selected bits of any CSR; +* define which bits of a CSR cannot be modified. 
+ +Assuming that bit 5 of a certain CSR `<csr>` should always be set irrespective of the default setting in the reference model, it is necessary to indicate that: + +* the reset value of that bit should be 1; +* writes to that bit should be ignored. + +The parameters needed to express this property are: + +* `<csr>_override_mask` which defines which bits of `<csr>` should be forced to a specific value at reset, +* `<csr>_override_value` which indicates the reset value of the forced bits of `<csr>`, +* `<csr>_write_mask` which indicates which bits of `<csr>` are writable (mask bit is set) and which are not (mask bit is cleared). + +Assuming that the XLEN is 32 and only bit 5 of `<csr>` should be fixed, the Yaml settings to use are: + +[,yaml] +---- +spike_param_tree: + ... + core_configs: + - + ... + <csr>_override_mask: 0x00000020 + <csr>_override_value: 0x00000020 + <csr>_write_mask: 0xffffffdf +---- + +=== Integration with the RISC-V Open Source ecosystem + +* The entire tandem verification infrastructure is available as Open Source under SHL 2.0+ (an Apache-type license) in OpenHW Group repositories `cva6` (https://github.com/openhwgroup/cva6/) and `core-v-verif` (https://github.com/openhwgroup/core-v-verif/). + +* The OpenHW Group tandem verification infrastructure relies on Spike (https://github.com/riscv-software-src/riscv-isa-sim/) as reference model. ++ +The tandem verification approach is a general template and is not bound to a single ISS or RTL simulator. However, it requires that the ISS provides a basic platform capable of running software matching the capabilities of the RTL testbench, including a model of memory and buses. +Because of this constraint the current tandem framework relies on Spike which is fully Open Source and provides all necessary components. + +* Verilator-based tandem verification ++ +Starting with version 5.016, Verilator supports tandem simulations. Earlier versions did not provide sufficient support for structured types in the DPI interface.
+ +include::reference.adoc[] diff --git a/docs/04_cv32a65x/tristan/verif-spec/media/axiagentmerge.png b/docs/04_cv32a65x/tristan/verif-spec/media/axiagentmerge.png new file mode 100644 index 0000000000..0c52b5568c Binary files /dev/null and b/docs/04_cv32a65x/tristan/verif-spec/media/axiagentmerge.png differ diff --git a/docs/04_cv32a65x/tristan/verif-spec/media/frontendsb.png b/docs/04_cv32a65x/tristan/verif-spec/media/frontendsb.png new file mode 100644 index 0000000000..43c1a98a02 Binary files /dev/null and b/docs/04_cv32a65x/tristan/verif-spec/media/frontendsb.png differ diff --git a/docs/04_cv32a65x/tristan/verif-spec/media/funccovresults.png b/docs/04_cv32a65x/tristan/verif-spec/media/funccovresults.png new file mode 100644 index 0000000000..720ee3d0a6 Binary files /dev/null and b/docs/04_cv32a65x/tristan/verif-spec/media/funccovresults.png differ diff --git a/docs/04_cv32a65x/tristan/verif-spec/media/hvp.png b/docs/04_cv32a65x/tristan/verif-spec/media/hvp.png new file mode 100644 index 0000000000..a13c274696 Binary files /dev/null and b/docs/04_cv32a65x/tristan/verif-spec/media/hvp.png differ diff --git a/docs/04_cv32a65x/tristan/verif-spec/media/image1.png b/docs/04_cv32a65x/tristan/verif-spec/media/image1.png new file mode 100644 index 0000000000..3c33b78579 Binary files /dev/null and b/docs/04_cv32a65x/tristan/verif-spec/media/image1.png differ diff --git a/docs/04_cv32a65x/tristan/verif-spec/media/image2.png b/docs/04_cv32a65x/tristan/verif-spec/media/image2.png new file mode 100644 index 0000000000..35b78c7ee9 Binary files /dev/null and b/docs/04_cv32a65x/tristan/verif-spec/media/image2.png differ diff --git a/docs/04_cv32a65x/tristan/verif-spec/media/image3.png b/docs/04_cv32a65x/tristan/verif-spec/media/image3.png new file mode 100644 index 0000000000..678feee156 Binary files /dev/null and b/docs/04_cv32a65x/tristan/verif-spec/media/image3.png differ diff --git a/docs/04_cv32a65x/tristan/verif-spec/media/image4.png 
b/docs/04_cv32a65x/tristan/verif-spec/media/image4.png new file mode 100644 index 0000000000..46f2dc26ef Binary files /dev/null and b/docs/04_cv32a65x/tristan/verif-spec/media/image4.png differ diff --git a/docs/04_cv32a65x/tristan/verif-spec/media/image5.png b/docs/04_cv32a65x/tristan/verif-spec/media/image5.png new file mode 100644 index 0000000000..af9ceb2429 Binary files /dev/null and b/docs/04_cv32a65x/tristan/verif-spec/media/image5.png differ diff --git a/docs/04_cv32a65x/tristan/verif-spec/media/image6.png b/docs/04_cv32a65x/tristan/verif-spec/media/image6.png new file mode 100644 index 0000000000..1c06cbd65b Binary files /dev/null and b/docs/04_cv32a65x/tristan/verif-spec/media/image6.png differ diff --git a/docs/04_cv32a65x/tristan/verif-spec/media/image7.png b/docs/04_cv32a65x/tristan/verif-spec/media/image7.png new file mode 100644 index 0000000000..4bd0f1ba57 Binary files /dev/null and b/docs/04_cv32a65x/tristan/verif-spec/media/image7.png differ diff --git a/docs/04_cv32a65x/tristan/verif-spec/media/interrupt_uvm_agent.png b/docs/04_cv32a65x/tristan/verif-spec/media/interrupt_uvm_agent.png new file mode 100644 index 0000000000..03cb21e2ec Binary files /dev/null and b/docs/04_cv32a65x/tristan/verif-spec/media/interrupt_uvm_agent.png differ diff --git a/docs/04_cv32a65x/tristan/verif-spec/media/rtlparam.png b/docs/04_cv32a65x/tristan/verif-spec/media/rtlparam.png new file mode 100644 index 0000000000..6ed4a655c6 Binary files /dev/null and b/docs/04_cv32a65x/tristan/verif-spec/media/rtlparam.png differ diff --git a/docs/04_cv32a65x/tristan/verif-spec/media/rtlparamgates.png b/docs/04_cv32a65x/tristan/verif-spec/media/rtlparamgates.png new file mode 100644 index 0000000000..19ed388e42 Binary files /dev/null and b/docs/04_cv32a65x/tristan/verif-spec/media/rtlparamgates.png differ diff --git a/docs/04_cv32a65x/tristan/verif-spec/media/toggleresults.png b/docs/04_cv32a65x/tristan/verif-spec/media/toggleresults.png new file mode 100644 index 
0000000000..95c8e8730b Binary files /dev/null and b/docs/04_cv32a65x/tristan/verif-spec/media/toggleresults.png differ diff --git a/docs/04_cv32a65x/tristan/verif-spec/verification_specifications.adoc b/docs/04_cv32a65x/tristan/verif-spec/verification_specifications.adoc new file mode 100644 index 0000000000..b7b59b27e1 --- /dev/null +++ b/docs/04_cv32a65x/tristan/verif-spec/verification_specifications.adoc @@ -0,0 +1,816 @@ +[.text-center] +*TRISTAN* + +[.text-center] +*Together for RISc-V Technology and ApplicatioNs* + +[.text-center] +image:./media/image1.png[./media/image1,width=273,height=273] + +[.text-center] +*Verification Specifications* + +*Project Website* www.tristan-project.eu + +*JU Grant Agreement Number* 101095947 + +[width="100%",cols="40%,60%",options="header",] + +|=== + +|image:./media/image2.png[./media/image2,width=172,height=81] |TRISTAN + +has received funding from the Key Digital Technologies Joint Undertaking +(KDT JU) under grant agreement nr. 101095947. The KDT JU receives support from the European Union’s Horizon Europe’s research and innovation programme and Austria, Belgium, Bulgaria, Croatia, Cyprus, Czechia, Germany, Denmark, Estonia, Greece, Spain, Finland, France, Hungary, Ireland, Israel, Iceland, Italy, Lithuania, Luxembourg, Latvia, Malta, Netherlands, Norway, Poland, Portugal, Romania, Sweden, Slovenia, Slovakia, Turkey + +|=== + +== Table of Contents + +. link:#_Introduction[Introduction] + +.. General Information + +.. Acronyms and Definitions + +. link:#_Verification_Strategy[Verification Strategy] + +.. Methodology + +.. Database Structure + +.. Tools + +.. Planned releases + +. link:#_Testbench[Testbench] + +.. Testbench Architecture + +.. Testbench Block-Diagram + +.. Testbench Components + +... Agents + +... UVCs + +... Checkers + +... Assertions + +... UVM Scoreboard + +... Coverage Model + +... Write or Generate Tests + +. link:#_Verification_Plan[Verification Plan] + +.. + +.. + +. 
link:#_Verification_Reports[Verification Reports] + +.. Regression Results + +.. Functional coverage + +.. Code coverage + +.. SpyGlass integration + +.. RTL issues detected + +[#_Introduction] +== Introduction +=== General Information + +This document describes the CVA6 verification strategy and implementation. + +=== Acronyms and Definitions + +[width="100%",cols="24%,76%",options="header",] + +|=== +|Acronym |Description +|TB |Testbench +|UVC |Universal verification component +|IF |SystemVerilog Interface +|CVXIF |CORE-V eXtension Interface +|UVM | Universal Verification Methodology +|RVFI | RISC-V Formal Interface +|DUT | Device Under Test +|SV | SystemVerilog +|VIF | Virtual InterFace +|AXI | Advanced eXtensible Interface +|=== + +[#_Verification_Strategy] +== Verification Strategy +=== Methodology + +This project is not a single verification environment that can support any-and-all CORE-V cores. Rather, it supports the verification of multiple cores by enabling the rapid creation of core-specific verification environments. There is no attempt to define a one-size-fits-all environment as these inevitably lead to either bloated code, needless complexity, or both. Instead, the idea is to create a toolkit allowing the rapid development of core-specific environments using a set of high-level reusable components and a standard UVM framework. + +UVM environments are often described as a hierarchy with the device-under-test (CVA6) at the bottom and testcases at the top. In between are various components with increasing degrees of abstraction as we go from the bottom levels (the register-transfer level) to the middle layers (transaction-level) to the top (tests). The lower layers of the environment see the least amount of re-use owing to the need to deal with core-specific issues. Components at this level are core-specific. At the transaction level there can be considerable amounts of re-use. 
For example, it is easy to imagine a single UVM RVFI Agent serving the needs of any and all CORE-V cores. The test level sees a mix of re-usable tests (e.g. RV32IMAC compliance) and core-specific tests (e.g. hardware loops in CV32A60X). + +The core-v-verif project exploits this idea to maximize re-use across multiple cores by striving to keep as much of the environment as possible independent of the core’s implementation. Components such as the instruction generator (RISCV-DV), reference model (Spike), CSR checkers can be made almost entirely independent of a specific core because they can be based on the ISA alone. Other components such as the functional coverage model, AXI & CVXIF Agents and the test-program environment can be implemented as a mix of re-usable components and core-specific components. + +Depending on the details of the top-level interfaces of individual cores, the lowest layers of this environment may not be re-usable at all. + +=== Database Structure + +The verification environment, built from the resources provided by core-v-verif project can be conceptually divided into four levels: Testbench Layer, Translation Layer, Abstraction Layer and Test Layer. Each of these will be discussed in turn. + +*_[.underline]#Testbench Layer:#_* + +The testbench layer is comprised of two SystemVerilog modules and several SystemVerilog interfaces. We will discuss the SystemVerilog interfaces first, as this will make it easier to understand the structure and purpose of the modules. + +*_[.underline]#SystemVerilog Interfaces:#_* + +The top-level ports of the core can be categorized as follows: + +* Instruction and Data memory interface(s) +* Clocks and Resets +* Configuration +* Trace +* Special Status and Control + +The Instruction and Data memory interface is listed first for a reason. This interface is generally the most core-specific. 
For example, CVA6 supports AXI-like Instruction and Data memory interfaces while other cores using the core-v-verif project can support other interfaces. This is a significant difference, and so the Testbench Layer deliberately hides this interface from the higher-level layers. This is done in the “DUT Wrapper” module, see below. + +The remaining interface categories can be defined as generic collections of input or output signals whose operation can be defined by higher layers. A few examples should illustrate this point: + +Clocks and resets can be parameterized arrays of clock and reset signals. The upper layers of the environment will define the number of clocks and implement the appropriate frequency and phase relationships. Resets are managed in the same manner. + +*_[.underline]#Testbench Modules:#_* + +The two modules of the Testbench Layer are the “DUT Wrapper” and the “Testbench”. The purpose of the wrapper is to conceal as many core-specific physical attributes as possible. As hinted at above, this is done by keeping control of the core’s memory interface(s) and mapping all other ports to one of the non-memory interface types. + +The wrapper instantiates a memory model that connects directly to the core’s instruction and data interface(s). This memory model also supports several memory-mapped virtual peripherals. The core’s memory interface is not “seen” by any other part of the environment, so this interface (or these interfaces, as the case may be) can be completely different from other cores and the only part of the environment affected is the DUT wrapper, and its memory model. The address map of the modeled memory and peripherals is implemented to ensure compatibility with the test-program environment.
+ +The Testbench module is mostly boiler-plate code that does the following: + +* Instantiates the wrapper, +* Push handles of the SV interfaces to the UVM configuration database, +* Invoke run_test(), +* Implement a final code-block to display test pass/fail. + +The expectation is that the DUT Wrapper module will be core-specific and will need to be coded from scratch for each core. The Testbench module is also expected to be core-specific but can be easily created by copying and modifying a Testbench module from a previous generation. The SystemVerilog interfaces for Clocks and Resets, Configuration, ISACOV, RVFI, Trace, AXI, plus Special Status and Control are generic enough to be fully re-used. + +*_[.underline]#Repository Structure:#_* + +The top-level of the repository is specifically organized to support multiple verification environments. The directory structure below shows a version of the environment that supports multiple CORE-V cores. What follows is a brief description of the purpose of each top-level directory. Refer to the README files at each of these locations for additional information. If you read nothing else, please read verif/README.md. + +*verif*: This directory contains the CVA6 specific environment, testbench, tests and simulation directories. The common part is located in verif/core-v-verif directory. + +*verif/core-v-verif/lib*: This is where the bulk of the re-usable components and tests are maintained. This is where you will find the instruction generator, reference model, common functional coverage models, UVM Agents for clocks-and-resets, interrupts, status, etc. + +=== Tools + +==== + +In our verification environment we use Synopsys tool (VCS) to simulate, generate coverage also, the tool version is *_VCS 2023_* + +==== + +As RISC-V reference model, we use Spike in tandem mode with VCS to ensure the CVA6 behaves as required. 
+ +=== Planned releases + +Provide details regarding verification milestones, starting from TB bring up till Code-Coverage complete. + +[#_Testbench] +== Testbench +=== Testbench Architecture + +This section describes the testbench of the CVA6 core. This environment + +is intended to be able to verify the CVA6 core and run different test + +cases with minimal modification to the environment itself. + +image:./media/image3.png[./media/image3,width=596,height=296] + +*_[.underline]#uvmt_cva6_tb#_* + +In this module we instantiate the agent interfaces, the uvmt_cva6_dut_wrap module and assertions modules. We set the interfaces using the configuration database set method. In this module we get the sim_finished database object set in the uvmt_cva6_base_test_c class. + +To check whether the simulation passed or failed we check the err_count and fatal_count along with sim_finished. + +*_[.underline]#uvmt_cva6_dut_wrap#_* + +In this module we instantiate the cva6_tb_wrapper module and we instantiate some unused outputs of CVXIF. + +*_[.underline]#cva6_tb_wrapper#_* + +In this module we instantiate the CVA6 core, and we connect it with all the interfaces of the CVA6 verification environment. Also, in this module we instantiate an SRAM, an AXI interface and an AXI adapter that we can use if we don’t have an AXI agent or if we want to deactivate the agent for performance reasons. The switch between the SRAM and AXI agent is possible thanks to the AXI switch. + +=== Testbench Block-Diagram + +image:./media/image4.png[./media/image4,width=624,height=294] + +*_[.underline]#uvmt_cva6_base_test_c#_* + +This class extends from uvm_test. It randomises the uvmt_cva6_test_cfg_c and uvme_cva6_cfg_c objects. The class's build_phase, connect_phase functions handle the setup and configuration of the environment, connecting it to the CVA6, and executing the test. + +We start the uvme_cva6_reset_vseq_c sequence in the reset phase on the uvme_cva6_vsqr_c sequencer.
+ +*_[.underline]#uvme_cva6_env_c#_* + +This class extends from the uvm_env class. The class's build_phase, connect_phase functions handle the setup and configuration + +of all the agents. The class's run_phase task start the sequence of the active agents on their sequencers. We get the configuration and context information for the environment in this class using the get method. + +[.underline]#Environment component# + +There are two types of uvm_component that are uvm agent and uvm coverage model. + +[.underline]#Environment_objects# + +There are two uvm_objects that are uvme_cva6_cfg_c and uvme_cva6_cntxt_c. The Objects contain configuration and context information for the environment. + +*_[.underline]#uvme_cva6_cfg_c#_* + +This class extends from the uvm_object class. The object encapsulates all parameters for creating, connecting and running CVA6 environment (uvme_cva6_env_c) components. This class also includes a constraint block that defines default values for some of its fields and other constraints on its fields such as, enabled and is_active fields are set to 0 and 'UVM_PASSIVE' respectively by default. + +*_[.underline]#uvme_cva6_cntxt_c#_* + +The class uvme_cva6_cntxt_c is an object that encapsulates all state variables for CVA6 environment (uvme_cva6_env_c) components. It inherits from the uvm_object base class.It also contains two events, sample_cfg_e and sample_cntxt_e, that can be used to synchronize the sampling of configuration and context information. + +*_[.underline]#uvme_cva6_vsqr_c#_* + +This class extends from the uvm_sequencer base class. It also has sequencer handles of all the active agent. This class is used to start the virtual sequence. + +*_[.underline]#uvme_cva6_reset_vseq_c#_* + +This class uvme_cva6_reset_vseq_c extends a class called uvme_cva6_base_vseq_c. 
The purpose of this sequence is to start the system clock and issue the initial reset pulse to the Device Under Test (DUT).The class has a default constructor and a virtual task called "body" which is responsible for starting the clock, waiting for a specified amount of time, and then resetting the DUT. + +=== Testbench Components + +==== Agents + +*_[.underline]#Clock & Reset Agent#_* + +This agent controls the clock and reset signal of the CVA6 core. + +[.underline]#uvma_clknrst_if:# + +The uvma_clknrst_if interface has two logic signals, clk and reset_n. The clk signal represents the system clock, while the reset_n signal is the active-low reset signal.The interface includes an initial block that contains a forever loop that generates the clock signal, based on the value of clk_active and clk_period. If clk_active is set to 1 and clk_period is 0, the function will raise a fatal error. The interface also includes three functions: set_period, which sets the value of clk_period; start_clk, which sets clk_active to 1; and stop_clk, which sets clk_active to 0. + +[width="100%",cols="<37%,<63%",options="header",] + +|=== + +|*signal* |*Description* + +|clk |Controls the Clock fed to the design under test. + +|reset_n |Control the reset state of the design under test. + +|=== + +[.underline]#uvma_clknrst_uvm_objects:# + +The uvm_objects uvma_clknrst_cfg_c , uvma_clknrst_cntxt_c contain the configuration and context information of the uvma_clknrst_agent. + +[.underline]#uvma_clknrst_seq_item_c:# + +The class represents an object created by Clock & Reset agent sequences that extend the uvma_clknrst_seq_base_c class. + +The class contains several randomized variables: + +* action is an enumerated variable of type "uvma_clknrst_seq_item_action_enum" that represents the operation to perform (e.g. start clock, stop clock, assert reset, de-assert reset). 
+ +* initial_value is an enumerated variable of type "uvma_clknrst_seq_item_initial_value_enum" that represents the initial value of the signals (if starting or asserting). + +* clk_period is an unsigned 32-bit integer variable representing the period of the clock signal. + +* rst_deassert_period is an unsigned 32-bit integer variable representing the amount of time (in picoseconds) after which to de-assert reset. + +* The class also includes a constraint "default_cons" which sets the default values for clk_period to 0 and rst_deassert_period to a value defined by uvma_clknrst_default_rst_deassert_period. + +The class has a default constructor which calls the superclass constructor. + +[.underline]#uvma_clknrst_Sequence:# + +It consists of two main sequences: uvma_clknrst_stop_clk_seq_c and uvma_clknrst_restart_clk_seq_c. + +* The uvma_clknrst_stop_clk_seq_c creates an instance of the uvma_clknrst_seq_item_c and set its action to the UVMA_CLKNRST_SEQ_ITEM_ACTION_STOP_CLK and start and finish the item. + +* The uvma_clknrst_restart_clk_seq_c creates an instance of the uvma_clknrst_seq_item_c and set the its action to the UVMA_CLKNRST_SEQ_ITEM_ACTION_RESTART_CLK and start and finish the item. + +[.underline]#uvma_clknrst_drv_c:# + +This class uvma_clknrst_drv_c is used for driving the interface of the clknrst agent. It get reqs from the sequence item port and calls the drv_req task. The drv_req task drives the virtual interface's (cntxt.vif) signals using req's contents. And then call the write method for the analysis port to send the req transaction to the coverage model. + +[.underline]#uvma_clknrst_mon_c:# + +This class uvma_clknrst_mon_c is used for monitoring the virtual interface of the Clock & Reset agent. 
The class extends the uvm_monitor class and contains objects for configuration (cfg) and context (cntxt), as well as an analysis port (ap) for transaction analysis.The run_phase() task in the uvma_clknrst_mon_c class is responsible for overseeing the monitoring process of the Clock and Reset virtual interface. It does this by executing the monitor_clk() and monitor_reset() tasks in parallel forks. + +[.underline]#uvma_clknrst_cov_model_c:# + +This class uvma_clknrst_cov_model_c extends from the uvm_component base class. The overall functionality of this class is to provide the coverage model for the clknrst_agent. It contains objects for configuration, context, monitor transaction, and sequence item, as well as two analysis FIFOs for holding transactions coming from the monitor and sequence item respectively. This section is in progress. + +[.underline]#uvma_clknrst_agent_c:# + +This class uvma_clknrst_agent_c extends from uvm_agent base class. This class encapsulates, builds and connects all the other components for driving and monitoring a Clock & reset interface. This class gets the cfg , cntxt using configuration database get method. It creates a driver,monitor,cov_model and sequencer. This class connects the driver with a sequencer. + +*_[.underline]#Cvxif Agent#_* + +Cv-xif agent supports custom instructions. Upon receiving the issue request it drives the response one clock cycle after the issue request. + +[.underline]#uvma_cvxif_intf:# + +The interface includes inputs for clock and reset_n signal, as well as two data input/output called cvxif_req_i and cvxif_resp_o. It includes a clocking block for the monitor monitor_cb to sample the cvxif_req_i and cvxif_resp_o signal at the rising edge of the clock. + +[width="100%",cols="<35%,<65%",options="header",] + +|=== + +|*Enum Variable* |*Description* + +|Cvxif_req_i |The request is sent to get a response + +|Cvxif_resp_o |The response is generated according to the request. 
+ +|=== + +[.underline]#uvma_cvxif_uvm_objects:# + +There are two uvm_objects uvma_cvxif_cfg_c and uvma_cvxif_cntxt_c. uvma_cvxif_cfg_c encapsulates all the parameters for creating, connecting and running the uvma_cvxif_agent_c agent. uvma_cvxif_cntxt_c confines all the state variables for all the CVXIF agent components. + +[.underline]#uvma_cvxif_Sequence_items:# + +Cvxif agent has two sequence items, uvma_cvxif_req_item_c and uvma_cvxif_resp_item_c, for the request and response transactions respectively. + +[.underline]#uvma_cvxif_sqr_c:# + +uvma_cvxif_sqr_c class extends from uvm_sequencer base class. It is a typical sequencer. This class instantiates a FIFO to receive the uvma_cvxif_req_item_c. + +[.underline]#uvma_cvxif_sequences:# + +* uvma_cvxif_base_seq_c class extends from uvm_sequence. This class simply implements a decode function that checks whether the instructions are legal or illegal. + +* uvma_cvxif_seq_c class extends from uvma_cvxif_base_seq_c class. This class gets the uvma_cvxif_req_item_c from the FIFO in the sequencer using the p_sequencer handle. In this sequence class, we send the response according to the request item received. If we receive an instruction from the req_item that is illegal, then we drive zeros on the response signals. Otherwise, we drive the response accordingly. + +[.underline]#uvma_cvxif_drv_c:# + +This class uvma_cvxif_drv_c extends from the uvm_driver class. This class has several tasks that perform different actions such as generating a random ready signal, getting response_item, driving an issue response to the VIF, driving results in an in-order or out-of-order fashion, and de-asserting signals. + +[.underline]#uvma_cvxif_mon_c:# + +uvma_cvxif_mon_c class extends from the uvm_monitor. It monitors the virtual interface vif. It monitors transaction requests and responses and sends transaction requests to uvma_cvxif_sqr_c and responses to the coverage model.
It has several fields, including objects for configuration and context, and analysis ports for transaction requests and responses. + +[.underline]#uvma_cvxif_cov_model_c:# + +uvma_cvxif_cov_model_c is derived from the uvm_component class. This class defines various objects and covergroups with different coverpoints, and it also uses the UVM library to sample these coverpoints and measure coverage.The main purpose of this class is to measure the functional coverage of a specific interface in the design and ensure that it has been fully tested. + +[.underline]#uvma_cvxif_agent_c:# + +uvma_cvxif_agent_c class extends from uvm_agent class. This class represents an agent that is responsible for the test execution and communication between the virtual interface (VIF) and the testbench components. The main role of this class is to create and connect the different components of the testbench and manage the communication between them and the virtual interface (VIF) during the test execution. + +*_[.underline]#RVFI Agent#_* + +The rvfi agent is a passive agent responsible for monitoring the rvfi tracing interface. It compares the transactions made with the core and reference model values and outputs the committed instructions to a file. + +[.underline]#uvma_rvfi_if# + +The rvfi interface of the testbench follows the rvfi specification. It mainly outputs the GPR (General Purpose Registers), FPR (Floating Point Registers), and CSRs (Control and Status Registers). + +[.underline]#uvma_rvfi_instr_mon_c# + +This class is used to monitor the RVFI interface. It produces rvfi transactions that will be broadcasted to the uvma_rvfi_mon_trn_logger_c, uvma_rvfi_scoreboard_c, and uvma_rvfi_reference_model_monitor. + +[.underline]#uvma_rvfi_mon_trn_logger_c# + +This class uses the produced transactions from uvma_rvfi_instr_mon_c to write a trace file containing all the committed instructions of the core. 
+ +[.underline]#uvma_rvfi_reference_model# + +This is the base class for the reference models. It defines a set of basic functions that will be called by the uvma_rvfi_reference_model_monitor. It produces the transactions that the uvma_rvfi_reference_model_monitor will use to serve the scoreboard. + +[.underline]#uvma_rvfi_spike# + +uvma_rvfi_spike is a child class of uvma_rvfi_reference_model that implements the necessary functions to execute Spike in instruction-by-instruction mode and retrieve its values. + +[.underline]#uvma_rvfi_reference_model_monitor# + +The main objective of this class is to produce reference model transactions to feed the uvma_rvfi_scoreboard_c. It uses the transactions from the monitor to notify the reference model of asynchronous events such as interruptions and execute instructions. With this information, it executes the committed instructions on the reference model and retrieves the necessary values to send to the scoreboard. + +[.underline]#uvma_rvfi_scoreboard_c# + +This class has the expected functionality following UVM standards. It receives transactions from both the core and the reference model and compares all the required fields to ensure the correct behaviour of the design, focusing mainly on GPR, FPR, and CSRs. It also checks events as interrupts or exceptions. + +*_[.underline]#Axi Agent#_* + +This agent is an AXI4 (Advanced eXtensible Interface) SV UVM1.1 SLAVE. Aligned to AXI4 AMBA spec https://developer.arm.com/documentation/ihi0022/hc + +[.underline]#Agent Architecture:# + +image:./media/image5.png[./media/image5,width=618,height=366] + +[.underline]#Agent components:# + +The AXI4 slave agent provides following components: + +* uvma_axi_agent_c: UVM Agent top file + +* uvma_axi_mon_c: Agent monitor, collects and broadcast transactions to the sequence in each clock. + +* uvma_axi_slv_seq_c: Generates AXI response to master depending on the received transaction from monitor. 
+ +* uvma_axi_sqr_c: Sequencer and Synchronizer, receives responses from the reactive sequence and synchronizes them to the driver. + +* uvma_axi_cntxt_c: Agent context, instantiates VIF uvma_axi_intf and +memory uvml_mem. VIF and Memory are accessible in all components through the context. + +* uvma_axi_cfg_c: Agent configuration, all available configuration fields are described in link:#_Configuration_Fields[configuration Fields] + +* uvmt_axi_assert_c: Assertion module bound to the AXI interface. + +* axi_transaction: encapsulates the life cycle of a transaction. It can be used by any component outside the agent that needs information about the AXI transaction. + +[.underline]#Supported features:# + +Only SLAVE mode is supported, features are: + +* Out of order transactions (Private): Transactions with different IDs can complete in any order. + +* Outstanding transactions (Private): Multiple write or read transactions can run at the same time. + +* Channel delay: Randomize the ready to valid latency, for AW, W and AR channels. + +* Randomize error injection: inject errors by randomizing the response signal, or by injecting errors from the sequences. + +* Atomic transactions: the agent supports this feature from AXI5. + +* Multiple regions signaling (Private): A region identifier, sent on the address channel for each transaction. If the master doesn't support this feature, the user must configure the memory mapping in the test class. + +* Access permissions (Private): access permissions signals can be used to protect against illegal transactions. If the master doesn't support this feature, the user must configure the memory attribution in the test class and the agent will check the access. + +[.underline]#Agent limitations:# + +The slave axi4 agent does not support: + +* QoS signaling + +* User-defined signaling + +The AXI protocol does not define the functions of these signals. + +[.underline]#Sequences lib:# + +This agent provides 2 sequences: + +.
Preload sequence: this sequence initializes the memory with the compiled test. + +. Slave reactive sequence: this sequence generates the appropriate response after it takes the request decoded by the synchronizer. When the response is generated, the sequence sends it via the driver. + +==== AXI agent improvement + +===== General overview +As part of the TRISTAN project, a collaboration between Thales DIS and CEA was put in place. The goal is to create a complete AXI agent that can function as either a slave or a master. This enhancement is significant as it allows for the verification of the agent with itself in a back-to-back testbench. +A back-to-back testbench is a small UVM testbench where we can connect an IP master with its slave to perform verification. Verifying a complex component like the AXI agent adds credibility to the project and the verification process. +This collaboration is also important for the open-source community, as it enables the agent to be utilized by many people across different projects. + +===== Agent developed by Thales DIS + +It is described in a previous section of this document. + +===== Agent developed by CEA +The AXI superset agent is a highly configurable AXI agent. It is based on the AMBA AXI and ACE Protocol Specification from ARM (https://developer.arm.com/documentation/ihi0022/g/). It provides the functionality of AXI master and AXI slave. In the master mode, it provides the sequences/APIs to drive READ/WRITE/ATOP transactions. In the slave mode, the agent uses an external memory to generate its responses. + +===== Enhancement +The idea was to merge the master part of the CEA agent with the Thales slave agent and create one agent supporting both slave and master functionality, incorporating all AXI4 features and the potential to support additional functionality from AXI5. The agent must also support external memory in slave mode.
+Since the Thales agent was already integrated with CVA6, we used it as a starting point and aligned it with the superset agent to simplify the merge. + +Below is the roadmap agreed upon by the two teams: + +image:./media/axiagentmerge.png[./media/axiagentmerge,width=602,height=285] + +At the time of writing this document, the merge of the two agents is not yet completed. +The agent remains to be published on GitHub and integrated with CVA6. + +*_[.underline]#Interrupt Agent#_* + +This section describes the interrupt agent, which acts as an interrupt controller for the CV32A65X. The agent is based on the following protocol: +https://github.com/openhwgroup/cva6/blob/master/verif/docs/Protocols/interrupt-verification.adoc + +[.underline]#Agent Architecture:# + +image:./media/interrupt_uvm_agent.png[./media/interrupt_uvm_agent,width=618,height=366] + +[.underline]#Agent components:# + +The Interrupt agent provides the following components: + +* uvma_interrupt_agent_c: UVM Agent. + +* uvma_interrupt_mon_c: Agent monitor, collects and broadcasts transactions to the coverage model each time the interrupt interface changes. + +* uvma_interrupt_base_seq_c: Base sequence, instantiates the agent configuration & context, and connects them with the sequencer configuration & context. + +* uvma_interrupt_seq_c: Generates interrupt requests & clears them based on the clear protocol described in the link above. + +* uvma_interrupt_seq_item_c: Holds the main items of an interrupt transaction: `+interrupt_vector+`, `+interrupt_channel_mask+` and interrupt delays. + +* uvma_interrupt_sqr_c: Sequencer, receives requests from the sequence and sends them to the driver. + +* uvma_interrupt_drv_c: drives the vif with the requests received from the sequencer. + +* uvma_interrupt_cntxt_c: Agent context, instantiates VIF uvma_interrupt_intf and memory uvml_mem. VIF and Memory are accessible in all components through the context.
+ +* uvma_interrupt_cfg_c: Agent configuration, all available configuration fields are described in Configuration Fields. + +[.underline]#Supported features:# + +Features are: + +* Asynchronous request: the agent supports asynchronous interrupt requests. + +* No channel Dependency: there’s no dependency between the interrupt channels, each one is managed independently. + +* Channel delay: provides a delay after setting the interrupt request, and also after clearing it. + +* Randomize channel: full randomization of setting interrupt requests. + +* Timeout: the agent triggers a `+UVM_FATAL+` after a number of clock cycles if the interrupt request has not been cleared. + +[.underline]#Agent configuration Fields:# + +* is_active: Switches the agent mode to active. The agent supports only UVM_ACTIVE mode (can’t be in passive mode). + +* trn_log_enabled: Enables the interrupt transaction logger when 1. + +* enable_interrupt: Enables sending interrupt requests when 1. + +* interrupt_plusarg_valid: Enables interrupt requests from the command line when 1. + +* num_irq_supported: Represents the number of interrupt channels supported. + +* irq_addr: Represents the memory address used by the interrupt clear mechanism. + +* enable_clear_irq: Enables the interrupt clear mechanism when 1. + +* irq_timeout: Represents the number of clock cycles before the agent triggers a `+UVM_FATAL+` timeout. + +[.underline]#Sequences:# + +This agent provides only one sequence: + +* Set/Clear sequence: this sequence sets the interrupt request and also clears it based on a protocol. + +==== UVCs + +No UVC is used in this project. + +==== Checkers + +In our environment we use a reference model called *Spike* to decide whether a test passed or failed. + +Spike is a functional model implemented in C++ that aims to mimic the behaviour of a RISC-V hart. It implements all the ratified extensions of RISC-V. The verification environment uses this tool as the reference model for core-level verification.
+ +You can see below how this flow works: + +image:./media/image6.png[./media/image6,width=557,height=252] + +==== Assertions + +*_[.underline]#AXI protocol assertions#_* + +To check the AXI protocol specification, each channel has its own assertion set: + +* AW channel protocol + +* W channel protocol + +* B channel protocol + +* R channel protocol + +* AR channel protocol + +* AMO assertions + +In addition to the channel assertion modules, there is a sixth module where we have implemented assertions that are common to several channels. + +*_[.underline]#AXI CVA6 assertions#_* + +Those assertion are limited to AXI CVA6 support: + +* CVA6 identify read transaction with an ID equal to 0 or 1 + +* CVA6 identify write transaction with an ID equal to 0 or 1 + +* user-defined extension for read address channel is equal to 0b00 + +* user-defined extension for write address channel is equal to 0b00 + +* Quality of Service identifier for write transaction is equal to 0b0000 + +* Quality of Service identifier for read transaction is equal to 0b0000 + +* Region indicator for write transaction is equal to 0b0000 + +* Region indicator for read transaction is equal to 0b0000 + +* AWCACHE is always equal to 0b0000 + +* ARCACHE is always equal to 0b0000 + +* Protection attributes for write transaction always take the 0b000 + +* Protection attributes for read transaction always take the 0b000 + +* all write transaction performed by CVA6 are of type INCR + +* all read transaction performed by CVA6 are of type INCR + +* all write transaction performed by CVA6 are equal to 0 + +* Check if all Read transaction performed by CVA6 are equal to 0 or 1 + +*_[.underline]#CvxIf assertions#_* + +Assertions to check Cvxif protocol: + +* Issue interface protocol assertions + +* Commit interface protocol assertions + +* Result interface protocol assertions + +==== UVM Scoreboard + +The UVM scoreboard is a verification component that contains checkers to verify the functionality of the design. 
It receives transaction-level objects captured from the interfaces of a DUT via TLM (Transaction-level modeling) analysis ports. +Generally, the scoreboard calculates the expected value using a model and compares it with the actual value captured from the DUT. +For the moment, the CVA6 scoreboard contains a sub-scoreboard for the frontend pipeline stage and checkers for registers. +In the coming months, sub-scoreboards for the other pipeline stages will be added. + +===== Frontend Pipeline Stage Scoreboard + +The frontend pipeline stage scoreboard verifies the CVA6 frontend stage, which includes the fetched instruction, next PC, and the RET/branch prediction. +Scoreboard architecture: + +image:./media/frontendsb.png[./media/frontendsb,width=426,height=93] + +Scoreboard component: + +* Frontend SB: The frontend scoreboard top file is where the model and the monitor are instantiated. It contains counters for fetched data, valid instructions, instruction types, and committed instructions captured from the ISACOV monitor using a TLM port. +* Monitor: Captures signals from the frontend interface and sends transactions to the scoreboard. +* Model: Models the behavior of the frontend by calculating the next program counter to compare it with the actual value. It realigns and pre-decodes the data sent by the cache to store it in the instruction queue and compares it with the instructions sent to the decode stage. It includes RAS and BHT classes to speculate on control flow instructions. + * Instr_realign: This task extracts instructions from the 64-bit blocks coming from the CACHE module. + * Instr_scan: This task pre-decodes the fetched instructions from the instr_realign module. It provides the instruction types: branch, jump, return, jalr, immediate, call, or others. These outputs are used to calculate the prediction address. + * BHT: Class that models the BHT submodule. + * RAS: Class that models the RAS submodule. 
+ * Instr_queue: Class that models the instr_queue submodule. +* Coverage: Multiple coverage models are used to ensure coverage of all possible cases. + +==== Coverage Model + +Our verification environment also includes functional coverage, defined by several coverage models: + +* *_ISACOV_*: provides functional coverage related to the supported ISA, based on the configuration. + +* *_CVXIF_*: provides functional coverage related to the CV-XIF protocol, as well as some custom instructions. + +* *_AXI_*: provides functional coverage related to the AXI protocol. + +Note that the functional coverage specific to CVA6, such as the CVXIF custom instructions and some AXI features, is defined in the environment. + +==== Write or Generate Tests +Testing is a crucial part of the verification process. Different types of tests are employed: + +* Generate assembly tests using CVA6-DV: Uses the RISCV-DV framework to generate assembly-level test programs for the CVA6 core. RISCV-DV is a flexible and extensible tool used for generating RISC-V architecture assembly programs to stress-test the core. The extensions to RISCV-DV are available at (https://github.com/openhwgroup/cva6/tree/master/verif/env/corev-dv). +* Write directed assembly tests: These are manually written test cases designed to target specific behaviors or edge cases that are not covered by automated test generation. + + + +[#_Verification_Plan] +== Verification Plan + +The DVPlans are available at (https://github.com/openhwgroup/cva6/tree/master/verif/docs/VerifPlans). + +=== + +* ISA DVPlan: Focuses on verifying the Instruction Set Architecture (ISA) to ensure the core executes instructions correctly. +* CV-XIF DVPlan: This covers the verification of the Core-View External Interface (CV-XIF), likely to be an interface protocol used for communication between the core and external components. It mentions that this plan is for the first version and will require updates as the protocol evolves. 
+* AXI DVPlan: Deals with the verification of the AXI (Advanced eXtensible Interface). +* Traps DVPlan: Verifies the behavior of the core when exceptions, interrupts, or traps occur. +* CSRs DVPlan: Focuses on verifying the control and status registers (CSRs), which are key for managing the operation of the core. + +=== + +* Frontend DVPlan: Verifies the frontend part of the core, which involves instruction fetching, next pc generation, pre-decoding instruction and RET/Branch prediction. + +[#_Verification_Reports] +== Verification Reports + +=== Regression Results + +* 2000+ tests running on the server: specifically, 2112 tests are executed as part of the regression process, which runs the entire suite of tests repeatedly to ensure that changes or updates do not introduce new bugs. All tests passed, which indicates a stable design at this stage. + +[cols="1,1,1,1,1"] +|=== +| +|ISA +|Traps +|CSRs +|Data hazard + +|Generated tests +|900 +|449 +|450 +|300 + +|Directed tests +|6 +|2 +|5 +|0 +|=== + +* Scripts handling failed tests: A script is used to automatically remove failing tests from the coverage database. This ensures that the coverage report is based solely on tests that pass, meaning the coverage metrics are representative of correctly functioning parts of the design. + + +=== Functional coverage + +* Create HVP in the CVA6 env to track functional coverage: HVP refers to "Hierarchical Verification Plan," which helps monitor and track the functional coverage of the tests. The goal is to ensure that all functional aspects of the core are exercised during the verification. Functional coverage measures how much of the design’s intended functionality has been exercised. +The functional coverage is split in several parts: + + * Programmer view level: it corresponds to CVA6 from the perspective of the programmer (architectural view): ISA, CSRs, and traps. + * Design level: it corresponds to the different parts of the CVA6 pipeline (micro-architectural view). 
+ +image:./media/hvp.png[./media/hvp,width=624,height=190] + +* 98.09% functional coverage for programmer view level: Achieving 98.09% coverage is an excellent result, indicating that most of the ISA (Instruction Set Architecture), CSRs (Control and Status Registers), and traps (interrupt and exception handling) have been tested thoroughly. + +image:./media/funccovresults.png[./media/funccovresults,width=426,height=93] + +* Justificative report for coverage holes: This refers to generating a report explaining any missing coverage (the 1.91% not covered). This might happen due to untested corner cases or unsupported configurations. + +* Regarding the design level, the 6.65% coverage result is poor as only the frontend stage of the CVA6 pipeline is currently addressed. + + +=== Code coverage + +* The coverage of line and condition was low, primarily due to unsupported features in the CV32A65X configuration. +* The solution was to parameterize the RTL to make it configurable, remove dead code from the coverage report, and eliminate dead gates from the netlist. +* To achieve this, we used the VCS switch “-cm_seqnoconst -diag noconst” to automatically exclude constant variables and inaccessible code from the coverage analysis. +* An example of the implementation is provided below: + +image:./media/rtlparam.png[./media/rtlparam,width=624,height=161] + +* The gain of parameterization for only MMU and FPU: + +image:./media/rtlparamgates.png[./media/rtlparamgates,width=975,height=92] + +* We encountered the same problem with toggle coverage, and the score was very low compared to that of line and condition coverage. +* We can’t use the same options applied for line and condition coverage because we can’t parameterize the interface signals. Additionally, the VCS simulator only tracks constant signals if they are directly assigned to a constant value. 
+* For example, if the aw_user signal from the AXI interface is directly assigned to '0', the simulator will exclude the signal from toggle coverage. However, if aw_user is assigned to another signal, even if that signal is constant, the simulator will take no action. +* To address this, the solution is to generate an exclusion file based on the results of a Python script that detects unsupported signals according to your configuration. (More details about the script) +* Low toggle coverage initially (50%): Toggle coverage refers to how often signals switch between states during simulation. Initially, only 50% of the signals in the design were toggling, which suggests that a significant portion of the design was not being exercised. + +image:./media/toggleresults.png[./media/toggleresults,width=241,height=70] + +* Issue with unsupported signals for configuration: The problem was that some signals were unsupported in the current design configuration. A Python script was used to identify these unsupported signals, and an exclusion file was generated to exclude these signals from code coverage, ensuring accurate reporting. + +==== What is missing in code coverage + +* CV-XIF tests: An update of the Core-View External Interface (CV-X-IF) protocol (version 1.0) is available, and the verification tests hadn’t been updated to reflect these changes. This gap in testing impacts code coverage, as some parts of the design using this protocol may remain untested. +The verification work of CV-X-IF v1.0 will be done in the ISOLDE project. +* Dead code: There is still some "dead code" in the design, code that is never executed. This could indicate unnecessary or outdated features that need to be either removed or refactored. + +=== SpyGlass integration + +SpyGlass is a static analysis tool used to find potential issues in RTL code, such as linting errors, coding style violations, and design rule checks. 
The verification effort involved: + +* Adding SpyGlass support from scratch: This included setting up scripts, Makefiles, and other infrastructure to integrate SpyGlass into the CVA6 verification environment. +* Integrating SpyGlass into regression tests: SpyGlass was included in the regression process so that any RTL changes would automatically be checked for issues. +* Reporting results in a dashboard: The results of the SpyGlass runs were compared with previous runs, and reports were generated to track progress, likely in a dashboard for easy visualization of errors and trends. + +=== RTL issues detected + +RTL (Register Transfer Level) bugs have been detected in various areas of the CVA6 design thanks to verification: + +* ISA bugs: Bugs in the instruction set implementation. +* Traps: Bugs in how the core handles exceptions and interrupts. +* CSRs: Bugs related to control and status registers. +* CV-X-IF: Bugs related to the CV-X-IF protocol. +* AXI: Bugs related to the AXI protocol. + +All issues are described in CVA6 GitHub repository (https://github.com/openhwgroup/cva6/issues?q=label%3AType%3ABug). + +http://www.tristan-project.eu[_www.tristan-project.eu_] + +_info@tristan-project.eu_ + + +[width="100%",cols="28%,72%",options="header",] + +|=== + +|image:./media/image7.png[Graphical user interface, application Description automatically generated,width=173,height=80] |_TRISTAN has received funding from the Key Digital Technologies Joint Undertaking (KDT JU) under grant agreement nr. 101095947. The KDT JU receives support from the European Union’s Horizon Europe’s research and innovation programme and Austria, Belgium, Bulgaria, Croatia, Cyprus, Czechia, Germany, Denmark, Estonia, Greece, Spain, Finland, France, Hungary, Ireland, Israel, Iceland, Italy, Lithuania, Luxembourg, Latvia, Malta, Netherlands, Norway, Poland, Portugal, Romania, Sweden, Slovenia, Slovakia, Turkey. 
+|=== diff --git a/docs/06_cv64a6_mmu/index.rst b/docs/06_cv64a6_mmu/index.rst index be609f4f0d..6adf3e3315 100644 --- a/docs/06_cv64a6_mmu/index.rst +++ b/docs/06_cv64a6_mmu/index.rst @@ -1,4 +1,4 @@ -CV32A65X documentation +CV64A6_MMU documentation ====================== .. toctree:: diff --git a/docs/06_cv64a6_mmu/riscv/priv-isa-cv64a6_mmu.html b/docs/06_cv64a6_mmu/riscv/priv-isa-cv64a6_mmu.html deleted file mode 100644 index ee255cc89d..0000000000 --- a/docs/06_cv64a6_mmu/riscv/priv-isa-cv64a6_mmu.html +++ /dev/null @@ -1,5910 +0,0 @@ - - - - - - - - -The RISC-V Instruction Set Manual for CV64A6_MMU: Volume II: Privileged Architecture - - - - - - -
-
-
-
-

This document describes the RISC-V privileged architecture tailored for -OpenHW Group CV64A6_MMU. -Not relevant parts (e.g. unsupported extensions) of the original -specification are replaced by placeholders.

-
-
-

Contributors to all versions of the spec in alphabetical order (please contact -editors to suggest corrections): Krste Asanović, Peter Ashenden, Rimas -Avižienis, Jacob Bachmeyer, Allen J. Baum, Jonathan Behrens, Paolo Bonzini, Ruslan Bukin, -Christopher Celio, Chuanhua Chang, David Chisnall, Anthony Coulter, Palmer Dabbelt, Monte -Dalrymple, Paul Donahue, Greg Favor, Dennis Ferguson, Marc Gauthier, Andy Glew, -Gary Guo, Mike Frysinger, John Hauser, David Horner, Olof -Johansson, David Kruckemyer, Yunsup Lee, Daniel Lustig, Andrew Lutomirski, Prashanth Mundkur, -Jonathan Neuschäfer, Rishiyur -Nikhil, Stefan O’Rear, Albert Ou, John Ousterhout, David Patterson, Dmitri -Pavlov, Kade Phillips, Josh Scheid, Colin Schmidt, Michael Taylor, Wesley Terpstra, Matt Thomas, Tommy Thorn, Ray -VanDeWalker, Megan Wachs, Steve Wallach, Andrew Waterman, Claire Wolf, -and Reinoud Zandijk.

-
-
-

This document is released under a Creative Commons Attribution 4.0 International License.

-
-
-

This document is a derivative of the RISC-V -privileged specification version 1.9.1 released under the following license: ©2010-2017 Andrew Waterman, Yunsup Lee, Rimas -Avižienis, -David Patterson, Krste Asanović. Creative Commons Attribution 4.0 International License.

-
-
-

Contributors to CV64A6_MMU versions of the spec in alphabetical order: -Jean-Roch Coulon, André Sintzoff.

-
-
-
-
-

Preface

-
-
-

Preface to Version for CV64A6_MMU

-
-
-

This document describes the RISC-V privileged architecture tailored for -OpenHW Group CV64A6_MMU.

-
-
-

Preface to Version 20240703

-
-
-

This document describes the RISC-V privileged architecture. This -release, version 20240703, contains the following versions of the RISC-V ISA -modules:

-
- ----- - - - - - - - - - - - - - - -
ModuleVersionStatus

Machine ISA
-Smstateen Extension
-Smcsrind/Sscsrind Extension
-Smepmp
-Smcntrpmf
-Smrnmi Extension
-Smcdeleg
-Smdbltrp
-Supervisor ISA
-Svade Extension
-Svnapot Extension
-Svpbmt Extension
-Svinval Extension
-Svadu Extension
-Sstc
-Sscofpmf
-Ssdbltrp
-Hypervisor ISA
-Shlcofideleg
-Svvptc

1.13
-1.0
-1.0
-1.0
-1.0
-1.0
-1.0
-1.0
-1.13
-1.0
-1.0
-1.0
-1.0
-1.0
-1.0
-1.0
-1.0
-1.0
-0.1
-1.0

Draft
-Ratified
-Ratified
-Ratified
-Ratified
-Ratified
-Ratified
-Draft
-Draft
-Ratified
-Ratified
-Ratified
-Ratified
-Ratified
-Ratified
-Ratified
-Draft
-Ratified
-Draft
-Ratified

-
-

The following changes have been made since version 1.12 of the Machine and -Supervisor ISAs, which, while not strictly backwards compatible, are not -anticipated to cause software portability problems in practice:

-
-
-
    -
  • -

    Redefined misa.MXL to be read-only, making MXLEN a constant.

    -
  • -
  • -

    Added the constraint that SXLEN≥UXLEN.

    -
  • -
-
-
-

Additionally, the following compatible changes have been -made to the Machine and Supervisor ISAs since version 1.12:

-
-
-
    -
  • -

    Defined the misa.B field to reflect that the B extension has been -implemented.

    -
  • -
  • -

    Defined the misa.V field to reflect that the V extension has been -implemented.

    -
  • -
  • -

    Defined the RV32-only medelegh and hedelegh CSRs.

    -
  • -
  • -

    Defined the misaligned atomicity granule PMA, superseding the proposed Zam -extension.

    -
  • -
  • -

    Allocated interrupt 13 for Sscofpmf LCOFI interrupt.

    -
  • -
  • -

    Defined hardware error and software check exception codes.

    -
  • -
  • -

    Specified synchronization requirements when changing the PBMTE fields -in menvcfg and henvcfg.

    -
  • -
  • -

    Exposed count-overflow interrupts to VS-mode via the Shlcofideleg extension.

    -
  • -
  • -

    Relaxed behavior of some HINTs when MXLEN > XLEN.

    -
  • -
-
-
-

Finally, the following clarifications and document improvements have been made -since the last document release:

-
-
-
    -
  • -

    Transliterated the document from LaTeX into AsciiDoc.

    -
  • -
  • -

    Included all ratified extensions through March 2024.

    -
  • -
  • -

    Clarified that "platform- or custom-use" interrupts are actually -"platform-use interrupts", where the platform can choose to make some custom.

    -
  • -
  • -

    Clarified semantics of explicit accesses to CSRs wider than XLEN bits.

    -
  • -
  • -

    Clarified that MXLEN≥SXLEN.

    -
  • -
  • -

    Clarified that WFI is not a HINT instruction.

    -
  • -
  • -

    Clarified that VS-stage page-table accesses set G-stage A/D bits.

    -
  • -
  • -

    Clarified ordering rules when PBMT=IO is used on main-memory regions.

    -
  • -
  • -

    Clarified ordering rules for hardware A/D bit updates.

    -
  • -
  • -

    Clarified that, for a given exception cause, xtval might sometimes -be set to a nonzero value but sometimes not.

    -
  • -
  • -

    Clarified exception behavior of unimplemented or inaccessible CSRs.

    -
  • -
  • -

    Clarified that Svpbmt allows implementations to override additional PMAs.

    -
  • -
  • -

    Replaced the concept of vacant memory regions with inaccessible memory or I/O regions.

    -
  • -
  • -

    Clarified that timer and count-overflow interrupts' arrival in -interrupt-pending registers is not immediate.

    -
  • -
-
-
-

Preface to Version 20211203

-
-
-

This document describes the RISC-V privileged architecture. This -release, version 20211203, contains the following versions of the RISC-V -ISA modules:

-
- ----- - - - - - - - - - - - - - - -
ModuleVersionStatus

Machine ISA
-Supervisor ISA
-Svnapot Extension
-Svpbmt Extension
-Svinval Extension
-Hypervisor ISA

1.12
-1.12
-1.0
-1.0
-1.0
-1.0

Ratified
-Ratified
-Ratified
-Ratified
-Ratified
-Ratified

-
-

The following changes have been made since version 1.11, which, while -not strictly backwards compatible, are not anticipated to cause software -portability problems in practice:

-
-
-
    -
  • -

    Changed MRET and SRET to clear mstatus.MPRV when leaving M-mode.

    -
  • -
  • -

    Reserved additional satp patterns for future use.

    -
  • -
  • -

    Stated that the scause Exception Code field must implement bits 4–0 -at minimum.

    -
  • -
  • -

    Relaxed I/O regions have been specified to follow RVWMO. The previous -specification implied that PPO rules other than fences and -acquire/release annotations did not apply.

    -
  • -
  • -

    Constrained the LR/SC reservation set size and shape when using -page-based virtual memory.

    -
  • -
  • -

    PMP changes require an SFENCE.VMA on any hart that implements -page-based virtual memory, even if VM is not currently enabled.

    -
  • -
  • -

    Allowed for speculative updates of page table entry A bits.

    -
  • -
  • -

    Clarify that if the address-translation algorithm non-speculatively -reaches a PTE in which a bit reserved for future standard use is set, a -page-fault exception must be raised.

    -
  • -
-
-
-

Additionally, the following compatible changes have been made since -version 1.11:

-
-
-
    -
  • -

    Removed the N extension.

    -
  • -
  • -

    Defined the mandatory RV32-only CSR mstatush, which contains most of -the same fields as the upper 32 bits of RV64’s mstatus.

    -
  • -
  • -

    Defined the mandatory CSR mconfigptr, which if nonzero contains the -address of a configuration data structure.

    -
  • -
  • -

    Defined optional mseccfg and mseccfgh CSRs, which control the -machine’s security configuration.

    -
  • -
  • -

    Defined menvcfg, henvcfg, and senvcfg CSRs (and RV32-only -menvcfgh and henvcfgh CSRs), which control various characteristics -of the execution environment.

    -
  • -
  • -

    Designated part of SYSTEM major opcode for custom use.

    -
  • -
  • -

    Permitted the unconditional delegation of less-privileged interrupts.

    -
  • -
  • -

    Added optional big-endian and bi-endian support.

    -
  • -
  • -

    Made priority of load/store/AMO address-misaligned exceptions -implementation-defined relative to load/store/AMO page-fault and -access-fault exceptions.

    -
  • -
  • -

    PMP reset values are now platform-defined.

    -
  • -
  • -

    An additional 48 optional PMP registers have been defined.

    -
  • -
  • -

    Slightly relaxed the atomicity requirement for A and D bit updates -performed by the implementation.

    -
  • -
  • -

    Clarify the architectural behavior of address-translation caches

    -
  • -
  • -

    Added Sv57 and Sv57x4 address translation modes.

    -
  • -
  • -

    Software breakpoint exceptions are permitted to write either 0 or the -pc to xtval.

    -
  • -
  • -

    Clarified that bare S-mode need not support the SFENCE.VMA -instruction.

    -
  • -
  • -

    Specified relaxed constraints for implicit reads of non-idempotent -regions.

    -
  • -
  • -

    Added the Svnapot Standard Extension, along with the N bit in Sv39, -Sv48, and Sv57 PTEs.

    -
  • -
  • -

    Added the Svpbmt Standard Extension, along with the PBMT bits in Sv39, -Sv48, and Sv57 PTEs.

    -
  • -
  • -

    Added the Svinval Standard Extension and associated instructions.

    -
  • -
-
-
-

Finally, the hypervisor architecture proposal has been extensively -revised.

-
-
-

Preface to Version 1.11

-
-
-

This is version 1.11 of the RISC-V privileged architecture. The document -contains the following versions of the RISC-V ISA modules:

-
- ----- - - - - - - - - - - - - - - -
ModuleVersionStatus

Machine ISA
-Supervisor ISA
-Hypervisor ISA

1.11
-1.11
-0.3

Ratified
-Ratified
-Draft

-
-

Changes from version 1.10 include:

-
-
-
    -
  • -

    Moved Machine and Supervisor spec to Ratified status.

    -
  • -
  • -

    Improvements to the description and commentary.

    -
  • -
  • -

    Added a draft proposal for a hypervisor extension.

    -
  • -
  • -

    Specified which interrupt sources are reserved for standard use.

    -
  • -
  • -

    Allocated some synchronous exception causes for custom use.

    -
  • -
  • -

    Specified the priority ordering of synchronous exceptions.

    -
  • -
  • -

    Added specification that xRET instructions may, but are not required -to, clear LR reservations if A extension present.

    -
  • -
  • -

    The virtual-memory system no longer permits supervisor mode to execute -instructions from user pages, regardless of the SUM setting.

    -
  • -
  • -

    Clarified that ASIDs are private to a hart, and added commentary about -the possibility of a future global-ASID extension.

    -
  • -
  • -

    SFENCE.VMA semantics have been clarified.

    -
  • -
  • -

    Made the mstatus.MPP field WARL, rather than WLRL.

    -
  • -
  • -

    Made the unused xip fields WPRI, rather than WIRI.

    -
  • -
  • -

    Made the unused misa fields WARL, rather than WIRI.

    -
  • -
  • -

    Made the unused pmpaddr and pmpcfg fields WARL, rather than WIRI.

    -
  • -
  • -

    Required all harts in a system to employ the same PTE-update scheme as -each other.

    -
  • -
  • -

    Rectified an editing error that misdescribed the mechanism by which -mstatus.xIE is written upon an exception.

    -
  • -
  • -

    Described scheme for emulating misaligned AMOs.

    -
  • -
  • -

    Specified the behavior of the misa and xepc registers in systems -with variable IALIGN.

    -
  • -
  • -

    Specified the behavior of writing self-contradictory values to the -misa register.

    -
  • -
  • -

    Defined the mcountinhibit CSR, which stops performance counters from -incrementing to reduce energy consumption.

    -
  • -
  • -

    Specified semantics for PMP regions coarser than four bytes.

    -
  • -
  • -

    Specified contents of CSRs across XLEN modification.

    -
  • -
  • -

    Moved PLIC chapter into its own document.

    -
  • -
-
-
-

Preface to Version 1.10

-
-
-

This is version 1.10 of the RISC-V privileged architecture proposal. -Changes from version 1.9.1 include:

-
-
-
    -
  • -

    The previous version of this document was released under a Creative -Commons Attribution 4.0 International License by the original authors, -and this and future versions of this document will be released under the -same license.

    -
  • -
  • -

    The explicit convention on shadow CSR addresses has been removed to -reclaim CSR space. Shadow CSRs can still be added as needed.

    -
  • -
  • -

    The mvendorid register now contains the JEDEC code of the core -provider as opposed to a code supplied by the Foundation. This avoids -redundancy and offloads work from the Foundation.

    -
  • -
  • -

    The interrupt-enable stack discipline has been simplified.

    -
  • -
  • -

    An optional mechanism to change the base ISA used by supervisor and -user modes has been added to the mstatus CSR, and the field previously -called Base in misa has been renamed to MXL for consistency.

    -
  • -
  • -

    Clarified expected use of XS to summarize additional extension state -status fields in mstatus.

    -
  • -
  • -

    Optional vectored interrupt support has been added to the mtvec and -stvec CSRs.

    -
  • -
  • -

    The SEIP and UEIP bits in the mip CSR have been redefined to support -software injection of external interrupts.

    -
  • -
  • -

    The mbadaddr register has been subsumed by a more general mtval -register that can now capture bad instruction bits on an illegal -instruction fault to speed instruction emulation.

    -
  • -
  • -

    The machine-mode base-and-bounds translation and protection schemes -have been removed from the specification as part of moving the virtual -memory configuration to sptbr (now satp). Some of the motivation for -the base and bound schemes are now covered by the PMP registers, but -space remains available in mstatus to add these back at a later date -if deemed useful.

    -
  • -
  • -

    In systems with only M-mode, or with both M-mode and U-mode but -without U-mode trap support, the medeleg and mideleg registers now -do not exist, whereas previously they returned zero.

    -
  • -
  • -

    Virtual-memory page faults now have mcause values distinct from -physical-memory access faults. Page-fault exceptions can now be -delegated to S-mode without delegating exceptions generated by PMA and -PMP checks.

    -
  • -
  • -

    An optional physical-memory protection (PMP) scheme has been proposed.

    -
  • -
  • -

    The supervisor virtual memory configuration has been moved from the -mstatus register to the sptbr register. Accordingly, the sptbr -register has been renamed to satp (Supervisor Address Translation and -Protection) to reflect its broadened role.

    -
  • -
  • -

    The SFENCE.VM instruction has been removed in favor of the improved -SFENCE.VMA instruction.

    -
  • -
  • -

    The mstatus bit MXR has been exposed to S-mode via sstatus.

    -
  • -
  • -

    The polarity of the PUM bit in sstatus has been inverted to shorten -code sequences involving MXR. The bit has been renamed to SUM.

    -
  • -
  • -

    Hardware management of page-table entry Accessed and Dirty bits has -been made optional; simpler implementations may trap to software to set -them.

    -
  • -
  • -

    The counter-enable scheme has changed, so that S-mode can control -availability of counters to U-mode.

    -
  • -
  • -

    H-mode has been removed, as we are focusing on recursive -virtualization support in S-mode. The encoding space has been reserved -and may be repurposed at a later date.

    -
  • -
  • -

    A mechanism to improve virtualization performance by trapping S-mode -virtual-memory management operations has been added.

    -
  • -
  • -

    The Supervisor Binary Interface (SBI) chapter has been removed, so -that it can be maintained as a separate specification.

    -
  • -
-
-
-

Preface to Version 1.9.1

-
-
-

This is version 1.9.1 of the RISC-V privileged architecture proposal. -Changes from version 1.9 include:

-
-
-
    -
  • -

    Numerous additions and improvements to the commentary sections.

    -
  • -
  • -

    Change configuration string proposal to use a search process that -supports various formats including Device Tree String and flattened -Device Tree.

    -
  • -
  • -

    Made misa optionally writable to support modifying base and -supported ISA extensions. CSR address of misa changed.

    -
  • -
  • -

    Added description of debug mode and debug CSRs.

    -
  • -
  • -

    Added a hardware performance monitoring scheme. Simplified the -handling of existing hardware counters, removing privileged versions of -the counters and the corresponding delta registers.

    -
  • -
  • -

    Fixed description of SPIE in presence of user-level interrupts.

    -
  • -
-
-
-
-
-

1. Introduction

-
-
-

This document describes the RISC-V privileged architecture, which covers -all aspects of RISC-V systems beyond the unprivileged ISA, including -privileged instructions as well as additional functionality required for -running operating systems and attaching external devices.

-
-
- - - - - -
- - -
-

Commentary on our design decisions is formatted as in this paragraph, -and can be skipped if the reader is only interested in the specification -itself.

-
-
-
-

We briefly note that the entire privileged-level design described in -this document could be replaced with an entirely different -privileged-level design without changing the unprivileged ISA, and -possibly without even changing the ABI. In particular, this privileged -specification was designed to run existing popular operating systems, -and so embodies the conventional level-based protection model. Alternate -privileged specifications could embody other more flexible -protection-domain models. For simplicity of expression, the text is -written as if this was the only possible privileged architecture.

-
-
-
-
-

1.1. RISC-V Privileged Software Stack Terminology

-
-

This section describes the terminology we use to describe components of -the wide range of possible privileged software stacks for RISC-V.

-
-
-

Figure 1 shows some of the possible software stacks -that can be supported by the RISC-V architecture. The left-hand side -shows a simple system that supports only a single application running on -an application execution environment (AEE). The application is coded to -run with a particular application binary interface (ABI). The ABI -includes the supported user-level ISA plus a set of ABI calls to -interact with the AEE. The ABI hides details of the AEE from the -application to allow greater flexibility in implementing the AEE. The -same ABI could be implemented natively on multiple different host OSs, -or could be supported by a user-mode emulation environment running on a -machine with a different native ISA.

-
-
- - - - - -
- - -
-

Our graphical convention represents abstract interfaces using black -boxes with white text, to separate them from concrete instances of -components implementing the interfaces.

-
-
-
-
-
-privimps -
-
Figure 1. Different implementation stacks supporting various forms of privileged execution.
-
-
-

The middle configuration shows a conventional operating system (OS) that -can support multiprogrammed execution of multiple applications. Each -application communicates over an ABI with the OS, which provides the -AEE. Just as applications interface with an AEE via an ABI, RISC-V -operating systems interface with a supervisor execution environment -(SEE) via a supervisor binary interface (SBI). An SBI comprises the -user-level and supervisor-level ISA together with a set of SBI function -calls. Using a single SBI across all SEE implementations allows a single -OS binary image to run on any SEE. The SEE can be a simple boot loader -and BIOS-style IO system in a low-end hardware platform, or a -hypervisor-provided virtual machine in a high-end server, or a thin -translation layer over a host operating system in an architecture -simulation environment.

-
-
- - - - - -
- - -
-

Most supervisor-level ISA definitions do not separate the SBI from the -execution environment and/or the hardware platform, complicating -virtualization and bring-up of new hardware platforms.

-
-
-
-
-

The rightmost configuration shows a virtual machine monitor -configuration where multiple multiprogrammed OSs are supported by a -single hypervisor. Each OS communicates via an SBI with the hypervisor, -which provides the SEE. The hypervisor communicates with the hypervisor -execution environment (HEE) using a hypervisor binary interface (HBI), -to isolate the hypervisor from details of the hardware platform.

-
-
- - - - - -
- - -
-

The ABI, SBI, and HBI are still a work-in-progress, but we are now -prioritizing support for Type-2 hypervisors where the SBI is provided -recursively by an S-mode OS.

-
-
-
-
-

Hardware implementations of the RISC-V ISA will generally require -additional features beyond the privileged ISA to support the various -execution environments (AEE, SEE, or HEE).

-
-
-
-

1.2. Privilege Levels

-
-

At any time, a RISC-V hardware thread (hart) is running at some -privilege level encoded as a mode in one or more CSRs (control and -status registers). Three RISC-V privilege levels are currently defined -as shown in Table 1.

-
- - ------ - - - - - - - - - - - - - - - - -
Table 1. RISC-V privilege levels.
LevelEncodingNameAbbreviation

0
-1
-2
-3

00
-01
-10
-11

User/Application
-Supervisor
-Reserved
-Machine

U
-S

-M

-
-

Privilege levels are used to provide protection between different -components of the software stack, and attempts to perform operations not -permitted by the current privilege mode will cause an exception to be -raised. These exceptions will normally cause traps into an underlying -execution environment.

-
-
- - - - - -
- - -
-

In the description, we try to separate the privilege level for which -code is written, from the privilege mode in which it runs, although the -two are often tied. For example, a supervisor-level operating system can -run in supervisor-mode on a system with three privilege modes, but can -also run in user-mode under a classic virtual machine monitor on systems -with two or more privilege modes. In both cases, the same -supervisor-level operating system binary code can be used, coded to a -supervisor-level SBI and hence expecting to be able to use -supervisor-level privileged instructions and CSRs. When running a guest -OS in user mode, all supervisor-level actions will be trapped and -emulated by the SEE running in the higher-privilege level.

-
-
-
-
-

The machine level has the highest privileges and is the only mandatory -privilege level for a RISC-V hardware platform. Code run in machine-mode -(M-mode) is usually inherently trusted, as it has low-level access to -the machine implementation. M-mode can be used to manage secure -execution environments on RISC-V. User-mode (U-mode) and supervisor-mode -(S-mode) are intended for conventional application and operating system -usage respectively.

-
-
-

Each privilege level has a core set of privileged ISA extensions with -optional extensions and variants. For example, machine-mode supports an -optional standard extension for memory protection. Also, supervisor mode -can be extended to support Type-2 hypervisor execution as described in -Chapter 14.

-
-
-

Implementations might provide anywhere from 1 to 3 privilege modes -trading off reduced isolation for lower implementation cost, as shown in -Table 2.

-
- - ----- - - - - - - - - - - - - - - -
Table 2. Supported combination of privilege modes.
Number of levelsSupported ModesIntended Usage

1
-2
-3

M
-M, U
-M, S, U

Simple embedded systems
-Secure embedded systems
-Systems running Unix-like operating systems

-
-

All hardware implementations must provide M-mode, as this is the only -mode that has unfettered access to the whole machine. The simplest -RISC-V implementations may provide only M-mode, though this will provide -no protection against incorrect or malicious application code.

-
-
- - - - - -
- - -
-

The lock feature of the optional PMP facility can provide some limited -protection even with only M-mode implemented.

-
-
-
-
-

Many RISC-V implementations will also support at least user mode -(U-mode) to protect the rest of the system from application code. -Supervisor mode (S-mode) can be added to provide isolation between a -supervisor-level operating system and the SEE.

-
-
-

A hart normally runs application code in U-mode until some trap (e.g., a -supervisor call or a timer interrupt) forces a switch to a trap handler, -which usually runs in a more privileged mode. The hart will then execute -the trap handler, which will eventually resume execution at or after the -original trapped instruction in U-mode. Traps that increase privilege -level are termed vertical traps, while traps that remain at the same -privilege level are termed horizontal traps. The RISC-V privileged -architecture provides flexible routing of traps to different privilege -layers.

-
-
- - - - - -
- - -
-

Horizontal traps can be implemented as vertical traps that return -control to a horizontal trap handler in the less-privileged mode.

-
-
-
-
-
-

1.3. Debug Mode

-
-

Implementations may also include a debug mode to support off-chip -debugging and/or manufacturing test. Debug mode (D-mode) can be -considered an additional privilege mode, with even more access than -M-mode. The separate debug specification proposal describes operation of -a RISC-V hart in debug mode. Debug mode reserves a few CSR addresses -that are only accessible in D-mode, and may also reserve some portions -of the physical address space on a platform.

-
-
-
-
-
-

2. Control and Status Registers (CSRs)

-
-
-

The SYSTEM major opcode is used to encode all privileged instructions in -the RISC-V ISA. These can be divided into two main classes: those that -atomically read-modify-write control and status registers (CSRs), which -are defined in the Zicsr extension, and all other privileged -instructions. The privileged architecture requires the Zicsr extension; -which other privileged instructions are required depends on the -privileged-architecture feature set.

-
-
-

In addition to the unprivileged state described in Volume I of this -manual, an implementation may contain additional CSRs, accessible by -some subset of the privilege levels using the CSR instructions described -in Volume I. In this chapter, we map out the CSR address space. The -following chapters describe the function of each of the CSRs according -to privilege level, as well as the other privileged instructions which -are generally closely associated with a particular privilege level. Note -that although CSRs and instructions are associated with one privilege -level, they are also accessible at all higher privilege levels.

-
-
-

Standard CSRs do not have side effects on reads but may have side -effects on writes.

-
-
-

2.1. CSR Address Mapping Conventions

-
-

The standard RISC-V ISA sets aside a 12-bit encoding space (csr[11:0]) -for up to 4,096 CSRs. By convention, the upper 4 bits of the CSR address -(csr[11:8]) are used to encode the read and write accessibility of the -CSRs according to privilege level as shown in Table 3. The top two bits (csr[11:10]) indicate whether the register is read/write (00, 01, or 10) or read-only (11). The next two bits (csr[9:8]) encode the lowest privilege level that can access the CSR.

-
-
- - - - - -
- - -
-

The CSR address convention uses the upper bits of the CSR address to -encode default access privileges. This simplifies error checking in the -hardware and provides a larger CSR space, but does constrain the mapping -of CSRs into the address space.

-
-
-

Implementations might allow a more-privileged level to trap otherwise -permitted CSR accesses by a less-privileged level to allow these -accesses to be intercepted. This change should be transparent to the -less-privileged software.

-
-
-
-
-

Instructions that access a non-existent CSR are reserved. -Attempts to access a CSR without appropriate privilege level -raise illegal-instruction exceptions or, as described in -[sec:hcauses], virtual-instruction exceptions. -Attempts to write a read-only register raise illegal-instruction exceptions. -A read/write register might also contain some bits that are -read-only, in which case writes to the read-only bits are ignored.

-
-
-

Table 3 also indicates the convention to -allocate CSR addresses between standard and custom uses. The CSR -addresses designated for custom uses will not be redefined by future -standard extensions.

-
-
-

Machine-mode standard read-write CSRs 0x7A0-0x7BF are reserved for -use by the debug system. Of these CSRs, 0x7A0-0x7AF are accessible -to machine mode, whereas 0x7B0-0x7BF are only visible to debug mode. -Implementations should raise illegal-instruction exceptions on -machine-mode access to the latter set of registers.

-
-
- - - - - -
- - -
-

Effective virtualization requires that as many instructions run natively -as possible inside a virtualized environment, while any privileged -accesses trap to the virtual machine monitor (Goldberg, 1974). CSRs that are read-only -at some lower privilege level are shadowed into separate CSR addresses -if they are made read-write at a higher privilege level. This avoids -trapping permitted lower-privilege accesses while still causing traps on -illegal accesses. Currently, the counters are the only shadowed CSRs.

-
-
-
-
-
-

2.2. CSR Listing

-
-

Table 4-Table 8 list the CSRs that -have currently been allocated CSR addresses. The timers, counters, and -floating-point CSRs are standard unprivileged CSRs. The other registers -are used by privileged code, as described in the following chapters. -Note that not all registers are required on all implementations.

-
- - ---------- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
Table 3. Allocation of RISC-V CSR address ranges.

CSR Address

Hex

Use and Accessibility

[11:10]

[9:8]

[7:4]

Unprivileged and User-Level CSRs

00

00

XXXX

0x000-0x0FF

Standard read/write

01

00

XXXX

0x400-0x4FF

Standard read/write

10

00

XXXX

0x800-0x8FF

Custom read/write

11

00

0XXX

0xC00-0xC7F

Standard read-only

11

00

10XX

0xC80-0xCBF

Standard read-only

11

00

11XX

0xCC0-0xCFF

Custom read-only

Supervisor-Level CSRs

00

01

XXXX

0x100-0x1FF

Standard read/write

01

01

0XXX

0x500-0x57F

Standard read/write

01

01

10XX

0x580-0x5BF

Standard read/write

01

01

11XX

0x5C0-0x5FF

Custom read/write

10

01

0XXX

0x900-0x97F

Standard read/write

10

01

10XX

0x980-0x9BF

Standard read/write

10

01

11XX

0x9C0-0x9FF

Custom read/write

11

01

0XXX

0xD00-0xD7F

Standard read-only

11

01

10XX

0xD80-0xDBF

Standard read-only

11

01

11XX

0xDC0-0xDFF

Custom read-only

Hypervisor and VS CSRs

00

10

XXXX

0x200-0x2FF

Standard read/write

01

10

0XXX

0x600-0x67F

Standard read/write

01

10

10XX

0x680-0x6BF

Standard read/write

01

10

11XX

0x6C0-0x6FF

Custom read/write

10

10

0XXX

0xA00-0xA7F

Standard read/write

10

10

10XX

0xA80-0xABF

Standard read/write

10

10

11XX

0xAC0-0xAFF

Custom read/write

11

10

0XXX

0xE00-0xE7F

Standard read-only

11

10

10XX

0xE80-0xEBF

Standard read-only

11

10

11XX

0xEC0-0xEFF

Custom read-only

Machine-Level CSRs

00

11

XXXX

0x300-0x3FF

Standard read/write

01

11

0XXX

0x700-0x77F

Standard read/write

01

11

100X

0x780-0x79F

Standard read/write

01

11

1010

0x7A0-0x7AF

Standard read/write debug CSRs

01

11

1011

0x7B0-0x7BF

Debug-mode-only CSRs

01

11

11XX

0x7C0-0x7FF

Custom read/write

10

11

0XXX

0xB00-0xB7F

Standard read/write

10

11

10XX

0xB80-0xBBF

Standard read/write

10

11

11XX

0xBC0-0xBFF

Custom read/write

11

11

0XXX

0xF00-0xF7F

Standard read-only

11

11

10XX

0xF80-0xFBF

Standard read-only

11

11

11XX

0xFC0-0xFFF

Custom read-only

-
- - ------ - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
Table 4. Currently allocated RISC-V unprivileged CSR addresses.
NumberPrivilegeNameDescription

Unprivileged Floating-Point CSRs

0x001
-0x002
-0x003

URW
-URW
-URW

fflags
-frm
-fcsr

Floating-Point Accrued Exceptions.
-Floating-Point Dynamic Rounding Mode.
-Floating-Point Control and Status Register (frm + fflags).

Unprivileged Zicfiss extension CSR

0x011

URW

ssp

Shadow Stack Pointer.

Unprivileged Counter/Timers

0xC00
-0xC01
-0xC02
-0xC03
-0xC04
-  
-0xC1F
-0xC80
-0xC81
-0xC82
-0xC83
-0xC84

-0xC9F

URO
-URO
-URO
-URO
-URO

-URO
-URO
-URO
-URO
-URO
-URO

-URO

cycle
-time
-instret
-hpmcounter3
-hpmcounter4
-⋮
-hpmcounter31
-cycleh
-timeh
-instreth
-hpmcounter3h
-hpmcounter4h
-⋮
-hpmcounter31h

Cycle counter for RDCYCLE instruction.
-Timer for RDTIME instruction.
-Instructions-retired counter for RDINSTRET instruction.
-Performance-monitoring counter.
-Performance-monitoring counter.

-Performance-monitoring counter.
-Upper 32 bits of cycle, RV32 only.
-Upper 32 bits of time, RV32 only.
-Upper 32 bits of instret, RV32 only.
-Upper 32 bits of hpmcounter3, RV32 only.
-Upper 32 bits of hpmcounter4, RV32 only.

-Upper 32 bits of hpmcounter31, RV32 only.

-
- - ------ - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
Table 5. Currently allocated RISC-V supervisor-level CSR addresses.
NumberPrivilegeNameDescription

Supervisor Trap Setup

0x100
-0x104
-0x105
-0x106

SRW
-SRW
-SRW
-SRW

sstatus
-sie
-stvec
-scounteren

Supervisor status register.
-Supervisor interrupt-enable register.
-Supervisor trap handler base address.
-Supervisor counter enable.

Supervisor Configuration

0x10A

SRW

senvcfg

Supervisor environment configuration register.

Supervisor Counter Setup

0x120

SRW

scountinhibit

Supervisor counter-inhibit register.

Supervisor Trap Handling

0x140
-0x141
-0x142
-0x143
-0x144
-0xDA0

SRW
-SRW
-SRW
-SRW
-SRW
-SRO

sscratch
-sepc
-scause
-stval
-sip
-scountovf

Scratch register for supervisor trap handlers.
-Supervisor exception program counter.
-Supervisor trap cause.
-Supervisor bad address or instruction.
-Supervisor interrupt pending.
-Supervisor count overflow.

Supervisor Protection and Translation

0x180

SRW

satp

Supervisor address translation and protection.

Debug/Trace Registers

0x5A8

SRW

scontext

Supervisor-mode context register.

Supervisor State Enable Registers

0x10C
- 0x10D
- 0x10E
- 0x10F

SRW
- SRW
- SRW
- SRW

sstateen0
- sstateen1
- sstateen2
- sstateen3

Supervisor State Enable 0 Register.
- Supervisor State Enable 1 Register.
- Supervisor State Enable 2 Register.
- Supervisor State Enable 3 Register.

-
- - ------ - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
Table 6. Currently allocated RISC-V hypervisor and VS CSR addresses.
NumberPrivilegeNameDescription

Hypervisor Trap Setup

0x600
-0x602
-0x603
-0x604
-0x606
-0x607
-0x612

HRW
-HRW
-HRW
-HRW
-HRW
-HRW
-HRW

hstatus
-hedeleg
-hideleg
-hie
-hcounteren
-hgeie
-hedelegh

Hypervisor status register.
-Hypervisor exception delegation register.
-Hypervisor interrupt delegation register.
-Hypervisor interrupt-enable register.
-Hypervisor counter enable.
-Hypervisor guest external interrupt-enable register.
-Upper 32 bits of hedeleg, RV32 only.

Hypervisor Trap Handling

0x643
-0x644
-0x645
-0x64A
-0xE12

HRW
-HRW
-HRW
-HRW
-HRO

htval
-hip
-hvip
-htinst
-hgeip

Hypervisor bad guest physical address.
-Hypervisor interrupt pending.
-Hypervisor virtual interrupt pending.
-Hypervisor trap instruction (transformed).
-Hypervisor guest external interrupt pending.

Hypervisor Configuration

0x60A
-0x61A

HRW
-HRW

henvcfg
-henvcfgh

Hypervisor environment configuration register.
-Upper 32 bits of henvcfg, RV32 only.

Hypervisor Protection and Translation

0x680

HRW

hgatp

Hypervisor guest address translation and protection.

Debug/Trace Registers

0x6A8

HRW

hcontext

Hypervisor-mode context register.

Hypervisor Counter/Timer Virtualization Registers

0x605
-0x615

HRW
-HRW

htimedelta
-htimedeltah

Delta for VS/VU-mode timer.
-Upper 32 bits of htimedelta, RV32 only.

Hypervisor State Enable Registers

0x60C
- 0x60D
- 0x60E
- 0x60F
- 0x61C
- 0x61D
- 0x61E
- 0x61F

HRW
- HRW
- HRW
- HRW
- HRW
- HRW
- HRW
- HRW

hstateen0
- hstateen1
- hstateen2
- hstateen3
- hstateen0h
- hstateen1h
- hstateen2h
- hstateen3h

Hypervisor State Enable 0 Register.
- Hypervisor State Enable 1 Register.
- Hypervisor State Enable 2 Register.
- Hypervisor State Enable 3 Register.
- Upper 32 bits of Hypervisor State Enable 0 Register, RV32 only.
- Upper 32 bits of Hypervisor State Enable 1 Register, RV32 only.
- Upper 32 bits of Hypervisor State Enable 2 Register, RV32 only.
- Upper 32 bits of Hypervisor State Enable 3 Register, RV32 only.

Virtual Supervisor Registers

0x200
-0x204
-0x205
-0x240
-0x241
-0x242
-0x243
-0x244
-0x280

HRW
-HRW
-HRW
-HRW
-HRW
-HRW
-HRW
-HRW
-HRW

vsstatus
-vsie
-vstvec
-vsscratch
-vsepc
-vscause
-vstval
-vsip
-vsatp

Virtual supervisor status register.
-Virtual supervisor interrupt-enable register.
-Virtual supervisor trap handler base address.
-Virtual supervisor scratch register.
-Virtual supervisor exception program counter.
-Virtual supervisor trap cause.
-Virtual supervisor bad address or instruction.
-Virtual supervisor interrupt pending.
-Virtual supervisor address translation and protection.

-
- - ------ - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
Table 7. Currently allocated RISC-V machine-level CSR addresses.
NumberPrivilegeNameDescription

Machine Information Registers

0xF11
-0xF12
-0xF13
-0xF14
-0xF15

MRO
-MRO
-MRO
-MRO
-MRO

mvendorid
-marchid
-mimpid
-mhartid
-mconfigptr

Vendor ID.
-Architecture ID.
-Implementation ID.
-Hardware thread ID.
-Pointer to configuration data structure.

Machine Trap Setup

0x300
-0x301
-0x302
-0x303
-0x304
-0x305
-0x306
-0x310
-0x312

MRW
-MRW
-MRW
-MRW
-MRW
-MRW
-MRW
-MRW
-MRW

mstatus
-misa
-medeleg
-mideleg
-mie
-mtvec
-mcounteren
-mstatush
-medelegh

Machine status register.
-ISA and extensions.
-Machine exception delegation register.
-Machine interrupt delegation register.
-Machine interrupt-enable register.
-Machine trap-handler base address.
-Machine counter enable.
-Additional machine status register, RV32 only.
-Upper 32 bits of medeleg, RV32 only.

Machine Trap Handling

0x340
-0x341
-0x342
-0x343
-0x344
-0x34A
-0x34B

MRW
-MRW
-MRW
-MRW
-MRW
-MRW
-MRW

mscratch
-mepc
-mcause
-mtval
-mip
-mtinst
-mtval2

Scratch register for machine trap handlers.
-Machine exception program counter.
-Machine trap cause.
-Machine bad address or instruction.
-Machine interrupt pending.
-Machine trap instruction (transformed).
-Machine bad guest physical address.

Machine Configuration

0x30A
-0x31A
-0x747
-0x757

MRW
-MRW
-MRW
-MRW

menvcfg
-menvcfgh
-mseccfg
-mseccfgh

Machine environment configuration register.
-Upper 32 bits of menvcfg, RV32 only.
-Machine security configuration register.
-Upper 32 bits of mseccfg, RV32 only.

Machine Memory Protection

0x3A0
-0x3A1
-0x3A2
-0x3A3

-0x3AE
-0x3AF
-0x3B0
-0x3B1

-0x3EF

MRW
-MRW
-MRW
-MRW

-MRW
-MRW
-MRW
-MRW

-MRW

pmpcfg0
-pmpcfg1
-pmpcfg2
-pmpcfg3
-⋯
-pmpcfg14
-pmpcfg15
-pmpaddr0
-pmpaddr1
-⋯
-pmpaddr63

Physical memory protection configuration.
-Physical memory protection configuration, RV32 only.
-Physical memory protection configuration.
-Physical memory protection configuration, RV32 only.

-Physical memory protection configuration.
-Physical memory protection configuration, RV32 only.
-Physical memory protection address register.
-Physical memory protection address register.

-Physical memory protection address register.

Machine State Enable Registers

0x30C
- 0x30D
- 0x30E
- 0x30F
- 0x31C
- 0x31D
- 0x31E
- 0x31F

MRW
- MRW
- MRW
- MRW
- MRW
- MRW
- MRW
- MRW

mstateen0
- mstateen1
- mstateen2
- mstateen3
- mstateen0h
- mstateen1h
- mstateen2h
- mstateen3h

Machine State Enable 0 Register.
- Machine State Enable 1 Register.
- Machine State Enable 2 Register.
- Machine State Enable 3 Register.
- Upper 32 bits of Machine State Enable 0 Register, RV32 only.
- Upper 32 bits of Machine State Enable 1 Register, RV32 only.
- Upper 32 bits of Machine State Enable 2 Register, RV32 only.
- Upper 32 bits of Machine State Enable 3 Register, RV32 only.

-
- - ------ - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
Table 8. Currently allocated RISC-V machine-level CSR addresses.
NumberPrivilegeNameDescription

Machine Non-Maskable Interrupt Handling

0x740
-0x741
-0x742
-0x744

MRW
-MRW
-MRW
-MRW

mnscratch
-mnepc
-mncause
-mnstatus

Resumable NMI scratch register.
-Resumable NMI program counter.
-Resumable NMI cause.
-Resumable NMI status.

Machine Counter/Timers

0xB00
-0xB02
-0xB03
-0xB04

-0xB1F
-0xB80
-0xB82
-0xB83
-0xB84

-0xB9F

MRW
-MRW
-MRW
-MRW

-MRW
-MRW
-MRW
-MRW
-MRW

-MRW

mcycle
-minstret
-mhpmcounter3
-mhpmcounter4
-⋮
-mhpmcounter31
-mcycleh
-minstreth
-mhpmcounter3h
-mhpmcounter4h
-⋮
-mhpmcounter31h

Machine cycle counter.
-Machine instructions-retired counter.
-Machine performance-monitoring counter.
-Machine performance-monitoring counter.

-Machine performance-monitoring counter.
-Upper 32 bits of mcycle, RV32 only.
-Upper 32 bits of minstret, RV32 only.
-Upper 32 bits of mhpmcounter3, RV32 only.
-Upper 32 bits of mhpmcounter4, RV32 only.

-Upper 32 bits of mhpmcounter31, RV32 only.

Machine Counter Setup

0x320
-0x323
-0x324

-0x33F
-0x723
-0x724

-0x73F

MRW
-MRW
-MRW

-MRW
-MRW
-MRW

-MRW

mcountinhibit
-mhpmevent3
-mhpmevent4
-⋮
-mhpmevent31
-mhpmevent3h
-mhpmevent4h
-⋮
-mhpmevent31h

Machine counter-inhibit register.
-Machine performance-monitoring event selector.
-Machine performance-monitoring event selector.

-Machine performance-monitoring event selector.
-Upper 32 bits of mhpmevent3, RV32 only.
-Upper 32 bits of mhpmevent4, RV32 only.

-Upper 32 bits of mhpmevent31, RV32 only.

Debug/Trace Registers (shared with Debug Mode)

0x7A0
-0x7A1
-0x7A2
-0x7A3
-0x7A8

MRW
-MRW
-MRW
-MRW
-MRW

tselect
-tdata1
-tdata2
-tdata3
-mcontext

Debug/Trace trigger register select.
-First Debug/Trace trigger data register.
-Second Debug/Trace trigger data register.
-Third Debug/Trace trigger data register.
-Machine-mode context register.

Debug Mode Registers

0x7B0
-0x7B1
-0x7B2
-0x7B3

DRW
-DRW
-DRW
-DRW

dcsr
-dpc
-dscratch0
-dscratch1

Debug control and status register.
-Debug program counter.
-Debug scratch register 0.
-Debug scratch register 1.

-
-
-

2.3. CSR Field Specifications

-
-

The following definitions and abbreviations are used in specifying the -behavior of fields within the CSRs.

-
-
-

2.3.1. Reserved Writes Preserve Values, Reads Ignore Values (WPRI)

-
-

Some whole read/write fields are reserved for future use. Software -should ignore the values read from these fields, and should preserve the -values held in these fields when writing values to other fields of the -same register. For forward compatibility, implementations that do not -furnish these fields must make them read-only zero. These fields are -labeled WPRI in the register descriptions.

-
-
- - - - - -
- - -
-

To simplify the software model, any backward-compatible future -definition of previously reserved fields within a CSR must cope with the -possibility that a non-atomic read/modify/write sequence is used to -update other fields in the CSR. Alternatively, the original CSR -definition must specify that subfields can only be updated atomically, -which may require a two-instruction clear bit/set bit sequence in -general that can be problematic if intermediate values are not legal.

-
-
-
-
-
-

2.3.2. Write/Read Only Legal Values (WLRL)

-

Some read/write CSR fields specify behavior for only a subset of -possible bit encodings, with other bit encodings reserved. Software -should not write anything other than legal values to such a field, and -should not assume a read will return a legal value unless the last write -was of a legal value, or the register has not been written since another -operation (e.g., reset) set the register to a legal value. These fields -are labeled WLRL in the register descriptions.

-
-
- - - - - -
- - -
-

Hardware implementations need only implement enough state bits to -differentiate between the supported values, but must always return the -complete specified bit-encoding of any supported value when read.

-
-
-
-
-

Implementations are permitted but not required to raise an -illegal-instruction exception if an instruction attempts to write a -non-supported value to a WLRL field. Implementations can return arbitrary -bit patterns on the read of a WLRL field when the last write was of an -illegal value, but the value returned should deterministically depend on -the illegal written value and the value of the field prior to the write.

-
-
-
-

2.3.3. Write Any Values, Reads Legal Values (WARL)

-

Some read/write CSR fields are only defined for a subset of bit -encodings, but allow any value to be written while guaranteeing to -return a legal value whenever read. Assuming that writing the CSR has no -other side effects, the range of supported values can be determined by -attempting to write a desired setting then reading to see if the value -was retained. These fields are labeled WARL in the register descriptions.

-
-
-

Implementations will not raise an exception on writes of unsupported -values to a WARL field. Implementations can return any legal value on the -read of a WARL field when the last write was of an illegal value, but the -legal value returned should deterministically depend on the illegal -written value and the architectural state of the hart.

-
-
-
-
-

2.4. CSR Field Modulation

-
-

If a write to one CSR changes the set of legal values allowed for a -field of a second CSR, then unless specified otherwise, the second CSR’s -field immediately gets an UNSPECIFIED value from among its new legal values. This -is true even if the field’s value before the write remains legal after -the write; the value of the field may be changed in consequence of the -write to the controlling CSR.

-
-
- - - - - -
- - -
-

As a special case of this rule, the value written to one CSR may control -whether a field of a second CSR is writable (with multiple legal values) -or is read-only. When a write to the controlling CSR causes the second -CSR’s field to change from previously read-only to now writable, that -field immediately gets an UNSPECIFIED but legal value, unless specified otherwise.

-
-
-
-

Some CSR fields are, when writable, defined as aliases of other CSR -fields. Let x be such a CSR field, and let y be the CSR field it aliases when writable. If a write to a controlling CSR causes field x to change from previously read-only to now writable, the new value of x is not UNSPECIFIED but instead immediately reflects the existing value of its alias y, as required.

-
-
-
-
-

A change to the value of a CSR for this reason is not a write to the -affected CSR and thus does not trigger any side effects specified for -that CSR.

-
-
-
-

2.5. Implicit Reads of CSRs

-
-

Implementations sometimes perform implicit reads of CSRs. (For -example, all S-mode instruction fetches implicitly read the satp CSR.) -Unless otherwise specified, the value returned by an implicit read of a -CSR is the same value that would have been returned by an explicit read -of the CSR, using a CSR-access instruction in a sufficient privilege -mode.

-
-
-
-

2.6. CSR Width Modulation

-
-

If the width of a CSR is changed (for example, by changing SXLEN or -UXLEN, as described in Section 3.1.6.3), the -values of the writable fields and bits of the new-width CSR are, -unless specified otherwise, determined from the previous-width CSR as -though by this algorithm:

-
-
-
    -
  1. -

    The value of the previous-width CSR is copied to a temporary register -of the same width.

    -
  2. -
  3. -

    For the read-only bits of the previous-width CSR, the bits at the same -positions in the temporary register are set to zeros.

    -
  4. -
  5. -

    The width of the temporary register is changed to the new width. If -the new width W is narrower than the previous width, the -least-significant W bits of the temporary register are -retained and the more-significant bits are discarded. If the new width -is wider than the previous width, the temporary register is -zero-extended to the wider width.

    -
  6. -
  7. -

    Each writable field of the new-width CSR takes the value of the bits -at the same positions in the temporary register.

    -
  8. -
-
-
-

Changing the width of a CSR is not a read or write of the CSR and thus -does not trigger any side effects.

-
-
-
-

2.7. Explicit Accesses to CSRs Wider than XLEN

-
-

If a standard CSR is wider than XLEN bits, then an explicit read -of the CSR returns the register’s least-significant XLEN bits, -and an explicit write to the CSR modifies only the register’s -least-significant XLEN bits, leaving the upper bits unchanged.

-
-
-

Some standard CSRs, such as the counter CSRs of extension -Zicntr, are always 64 bits, even when XLEN=32 (RV32). -For each such 64-bit CSR (for example, counter time), -a corresponding 32-bit high-half CSR is usually defined with -the same name but with the letter ‘h’ appended at the end (timeh). -The high-half CSR aliases bits 63:32 of its namesake -64-bit CSR, thus providing a way for RV32 software -to read and modify the otherwise-unreachable 32 bits.

-
-
-

Standard high-half CSRs are accessible only when -the base RISC-V instruction set is RV32 (XLEN=32). -For RV64 (when XLEN=64), the addresses of all standard high-half CSRs -are reserved, so an attempt to access a high-half CSR -typically raises an illegal-instruction exception.

-
-
-
-
-
-

3. Machine-Level ISA, Version 1.13

-
-
-

This chapter describes the machine-level operations available in -machine-mode (M-mode), which is the highest privilege mode in a RISC-V -hart. M-mode is used for low-level access to a hardware platform and -is the first mode entered at reset. M-mode can also be used to implement -features that are too difficult or expensive to implement in hardware -directly. The RISC-V machine-level ISA contains a common core that is -extended depending on which other privilege levels are supported and -other details of the hardware implementation.

-
-
-

3.1. Machine-Level CSRs

-
-

3.1.1. Machine ISA (misa) Register

-
-

The misa CSR is a WARL read-write register reporting the ISA supported by the hart.

-
-
-
-Diagram -
-
Figure 2. Machine ISA register (misa)
-
-
-

[CVA6] The MXL (Machine XLEN) field encodes the native base integer ISA width as -shown in Table 9. The MXL field is read-only. -In CVA6, the misa register returns the MXL field which indicates the -effective XLEN in M-mode, a constant termed MXLEN.

-
- - ---- - - - - - - - - - - - - -
Table 9. Encoding of MXL field in misa
MXLXLEN

1
-2
-3

32
-64
-128

-
-

The misa CSR is MXLEN bits wide.

-
-
-

[CVA6] The Extensions field encodes the presence of the standard extensions, -with a single bit per letter of the alphabet (bit 0 encodes presence of -extension "A", bit 1 encodes presence of extension "B", through to -bit 25 which encodes "Z"). The "I" bit will be set for RV32I, RV64I, -and RV128I base ISAs, and the "E" bit will be set for RV32E and RV64E. -In CVA6, the Extensions field is not writable: the presence of standard -extensions corresponds to the hardware reset value and cannot be modified -by writing to the register.

-
- - ----- - - - - - - - - - - - - - - -
Table 10. Encoding of Extensions field in misa. All bits that are reserved for future use must return zero when read.
BitCharacterDescription

0
-1
-2
-3
-4
-5
-6
-7
-8
-9
-10
-11
-12
-13
-14
-15
-16
-17
-18
-19
-20
-21
-22
-23
-24
-25

A
-B
-C
-D
-E
-F
-G
-H
-I
-J
-K
-L
-M
-N
-O
-P
-Q
-R
-S
-T
-U
-V
-W
-X
-Y
-Z

Atomic extension
-B extension
-Compressed extension
-Double-precision floating-point extension
-RV32E/64E base ISA
-Single-precision floating-point extension
-Reserved
-Hypervisor extension
-RV32I/64I/128I base ISA
-Reserved
-Reserved
-Reserved
-Integer Multiply/Divide extension
-Tentatively reserved for User-Level Interrupts extension
-Reserved
-Tentatively reserved for Packed-SIMD extension
-Quad-precision floating-point extension
-Reserved
-Supervisor mode implemented
-Reserved
-User mode implemented
-Vector extension
-Reserved
-Non-standard extensions present
-Reserved
-Reserved

-
-

The "U" and "S" bits will be set if there is support for user and -supervisor modes respectively.

-
-
-

The "X" bit will be set if there are any non-standard extensions.

-
-
-

When the "B" bit is 1, the implementation supports the instructions provided by the -Zba, Zbb, and Zbs extensions. When the "B" bit is 0, it indicates that the -implementation may not support one or more of the Zba, Zbb, or Zbs extensions.

-
-
-
-

3.1.2. Machine Vendor ID (mvendorid) Register

-
-

[CVA6] The mvendorid CSR is a 32-bit read-only register providing the JEDEC -manufacturer ID of the provider of the core. -In CVA6, mvendorid is implemented and returns the JEDEC manufacturer -ID assigned to the OpenHW Group, 0x602.

-
-
-
-Diagram -
-
Figure 3. Vendor ID register (mvendorid)
-
-
-
-

3.1.3. Machine Architecture ID (marchid) Register

-
-

[CVA6] The marchid CSR is an MXLEN-bit read-only register encoding the base -microarchitecture of the hart. -In CVA6, marchid is implemented and returns the architecture -ID assigned to CVA6, 0x3.

-
-
-
-Diagram -
-
Figure 4. Machine Architecture ID (marchid) register
-
-
-
-

3.1.4. Machine Implementation ID (mimpid) Register

-
-

The mimpid CSR provides a unique encoding of the version of the -processor implementation.

-
-
-

[CVA6] The mimpid register is implemented and the return value is TODO. -The Implementation value should reflect the design of the RISC-V -processor itself and not any surrounding system.

-
-
-
-Diagram -
-
Figure 5. Machine Implementation ID (mimpid) register
-
-
-
-

3.1.5. Hart ID (mhartid) Register

-
-

[CV64A6_MMU] The mhartid CSR is an MXLEN-bit read-only register containing the -integer ID of the hardware thread running the code. This register is -readable. In a CV64A6_MMU-based system, only one hart is implemented. -Its hart ID is zero.

-
-
-
-Diagram -
-
Figure 6. Hart ID (mhartid) register
-
-
-
-

3.1.6. Machine Status (mstatus) Register

-
-

[CV64A6_MMU] The mstatus register is an MXLEN-bit read/write register formatted as -shown in Figure 7. The mstatus register -keeps track of and controls the hart’s current operating state.

-
-
-
-Diagram -
-
Figure 7. Machine-mode status (mstatus) register for RV64
-
-
-
3.1.6.1. Privilege and Global Interrupt-Enable Stack in mstatus register
-
-

Global interrupt-enable bits, MIE and SIE, are provided for M-mode and -S-mode respectively. These bits are primarily used to guarantee -atomicity with respect to interrupt handlers in the current privilege -mode.

-
-
-

When a hart is executing in privilege mode x, interrupts are globally -enabled when xIE=1 and globally disabled when xIE=0. Interrupts for -lower-privilege modes, w<x, are always globally -disabled regardless of the setting of any global wIE bit for the -lower-privilege mode. Interrupts for higher-privilege modes, -y>x, are always globally enabled regardless of the -setting of the global yIE bit for the higher-privilege mode. -Higher-privilege-level code can use separate per-interrupt enable bits -to disable selected higher-privilege-mode interrupts before ceding -control to a lower-privilege mode.

-
-
-

TODO

-
-
-

An MRET or SRET instruction is used to return from a trap in M-mode or -S-mode respectively. When executing an xRET instruction, supposing -xPP holds the value y, xIE is set to xPIE; the privilege mode is -changed to y; xPIE is set to 1; and xPP is set to the -least-privileged supported mode (U if U-mode is implemented, else M). If -y≠M, xRET also sets MPRV=0.

-
-
-

xPP fields are WARL fields that can hold only privilege mode x and any implemented privilege mode lower than x. If privilege mode x is not implemented, then xPP must be read-only 0.

-
-
-
-
3.1.6.2. Double Trap Control in mstatus Register
-
-

[CV64A6_MMU] As Double Trap Control (Smdbltrp extension) is not implemented, -MDT field is read-only 0.

-
-
-
-
3.1.6.3. Base ISA Control in mstatus Register
-
-

[CV64A6_MMU] The SXL and UXL fields are read-only fields that encode the -value of XLEN for S-mode and U-mode, respectively. The encoding of these -fields is the same as the MXL field of misa, shown in Table 9. -The effective XLEN in S-mode and U-mode are termed SXLEN and UXLEN, respectively. -Their values are set to UXLEN=SXLEN=MXLEN.

-
-
-
-
3.1.6.4. Memory Privilege in mstatus Register
-
-

The MPRV (Modify PRiVilege) bit modifies the effective privilege mode, -i.e., the privilege level at which loads and stores execute. When -MPRV=0, loads and stores behave as normal, using the translation and -protection mechanisms of the current privilege mode. When MPRV=1, load -and store memory addresses are translated and protected, and endianness -is applied, as though the current privilege mode were set to MPP. -Instruction address-translation and protection are unaffected by the -setting of MPRV.

-
-
-

An MRET or SRET instruction that changes the privilege mode to a mode -less privileged than M also sets MPRV=0.

-
-
-

The MXR (Make eXecutable Readable) bit modifies the privilege with which -loads access virtual memory. When MXR=0, only loads from pages marked -readable (R=1 in [sv32pte]) will succeed. When -MXR=1, loads from pages marked either readable or executable (R=1 or -X=1) will succeed. MXR has no effect when page-based virtual memory is -not in effect.

-
-
-

The SUM (permit Supervisor User Memory access) bit modifies the -privilege with which S-mode loads and stores access virtual memory. When -SUM=0, S-mode memory accesses to pages that are accessible by U-mode -(U=1 in [sv32pte]) will fault. When SUM=1, these -accesses are permitted. SUM has no effect when page-based virtual memory -is not in effect. Note that, while SUM is ordinarily ignored when not -executing in S-mode, it is in effect when MPRV=1 and MPP=S.

-
-
-

The MXR and SUM mechanisms only affect the interpretation of permissions -encoded in page-table entries. In particular, they have no impact on -whether access-fault exceptions are raised due to PMAs or PMP.

-
-
-
-
3.1.6.5. Endianness Control in mstatus and mstatush Registers
-
-

The MBE, SBE, and UBE bits in mstatus and mstatush are WARL fields that -control the endianness of memory accesses other than instruction -fetches. Instruction fetches are always little-endian.

-
-
-

MBE controls whether non-instruction-fetch memory accesses made from -M-mode (assuming mstatus.MPRV=0) are little-endian (MBE=0) or -big-endian (MBE=1).

-
-
-

SBE controls whether explicit load and store memory accesses made from S-mode are -little-endian (SBE=0) or big-endian (SBE=1).

-
-
-

UBE controls whether explicit load and store memory accesses made from U-mode are -little-endian (UBE=0) or big-endian (UBE=1).

-
-
-

Memory accesses in M-mode are always little-endian, so MBE is read-only zero.

-
-
-

Memory accesses in S-mode are always little-endian, so SBE is read-only zero.

-
-
-

Memory accesses in U-mode are always little-endian, so UBE is read-only zero.

-
-
-
-
3.1.6.6. Virtualization Support in mstatus Register
-
-

The TVM (Trap Virtual Memory) bit is a WARL field that supports intercepting -supervisor virtual-memory management operations. When TVM=1, attempts to -read or write the satp CSR or execute an SFENCE.VMA or SINVAL.VMA -instruction while executing in S-mode will raise an illegal-instruction -exception. When TVM=0, these operations are permitted in S-mode.

-
-
-

The TW (Timeout Wait) bit is a WARL field that supports intercepting the WFI -instruction (see Section 3.3.3). When TW=0, the WFI -instruction may execute in lower privilege modes when not prevented for -some other reason. When TW=1, then if WFI is executed in any -less-privileged mode, and it does not complete within an -implementation-specific, bounded time limit, the WFI instruction causes -an illegal-instruction exception. An implementation may have WFI always -raise an illegal-instruction exception in less-privileged modes when -TW=1, even if there are pending globally-disabled interrupts when the -instruction is executed.

-
-
-

The TSR (Trap SRET) bit is a WARL field that supports intercepting the -supervisor exception return instruction, SRET. When TSR=1, attempts to -execute SRET while executing in S-mode will raise an illegal-instruction -exception. When TSR=0, this operation is permitted in S-mode.

-
-
-
-
3.1.6.7. Extension Context Status in mstatus Register
-
-

Supporting substantial extensions is one of the primary goals of RISC-V, -and hence we define a standard interface to allow unchanged -privileged-mode code, particularly a supervisor-level OS, to support -arbitrary user-mode state extensions.

-
-
-

[CV64A6_MMU] The FS[1:0] and VS[1:0] WARL fields and the XS[1:0] read-only field are used -to reduce the cost of context save and restore by setting and tracking -the current state of the floating-point unit and any other user-mode -extensions respectively.

-
-
-

As the F extension is not implemented, -FS is read-only zero.

-
-
-

As the v registers are not implemented, -VS is read-only zero.

-
-
-

As no additional user extensions require new state, the -XS field is read-only zero. TODO

-
-
-

[CV64A6_MMU] The SD bit is a read-only bit that summarizes whether either the FS, VS, -or XS fields signal the presence of some dirty state that will require -saving extended user context to memory.

-
-
-

[CV64A6_MMU] As FS, XS, and VS are all read-only zero, SD is also always -zero.

-
-
-

[CV64A6_MMU] When an extension’s status is set to Off, any instruction that attempts -to read or write the corresponding state will cause an -illegal-instruction exception.

-
-
-
-
3.1.6.8. Previous Expected Landing Pad (ELP) State in mstatus Register
-
-

[CV64A6_MMU] As the Zicfilp extension is not supported, -the SPELP and MPELP fields are read-only zero.

-
-
-
-
-

3.1.7. Machine Trap-Vector Base-Address (mtvec) Register

-
-

The mtvec register is an MXLEN-bit WARL read/write register that holds -trap vector configuration, consisting of a vector base address (BASE) -and a vector mode (MODE).

-
-
-
-Diagram -
-
Figure 8. Encoding of mtvec MODE field.
-
-
-

[CV64A6_MMU] The mtvec register is writable. The value in the BASE field must -always be aligned on a 4-byte boundary. mtvec always operates in -MODE=Direct.

-
- - ----- - - - - - - - - - - - - - - -
Table 11. Encoding of mtvec MODE field.
ValueNameDescription

0
-1
-≥2

Direct
-Vectored
----

All traps set pc to BASE.
-Asynchronous interrupts set pc to BASE+4×cause.
-Reserved

-
-

The encoding of the MODE field is shown in -Table 11. When MODE=Direct, all traps into -machine mode cause the pc to be set to the address in the BASE field.

-
-
-
-

3.1.8. Machine Trap Delegation (medeleg and mideleg) Registers

-
-

By default, all traps at any privilege level are handled in machine -mode, though a machine-mode handler can redirect traps back to the -appropriate level with the MRET instruction -(Section 3.3.2). -The machine exception -delegation register (medeleg) is a 64-bit read/write register. -The machine interrupt delegation (mideleg) register is an MXLEN-bit -read/write register. -Setting a bit in medeleg or mideleg will delegate the -corresponding trap, when occurring in S-mode or U-mode, to the S-mode -trap handler.

-
-
-
-Diagram -
-
Figure 9. Machine Exception Delegation (medeleg) register.
-
-
-

medeleg has a bit position allocated for every synchronous exception -shown in Table 12, with the index of the -bit position equal to the value returned in the mcause register (i.e., -setting bit 8 allows user-mode environment calls to be delegated to a -lower-privilege trap handler).

-
-
-

The medelegh register does not exist when XLEN=64.

-
-
-
-Diagram -
-
Figure 10. Machine Interrupt Delegation (mideleg) Register.
-
-
-

mideleg holds trap delegation bits for individual interrupts, with the -layout of bits matching those in the mip register (i.e., STIP -interrupt delegation control is located in bit 5).

-
-
-
-

3.1.9. Machine Interrupt (mip and mie) Registers

-
-

The mip register is an MXLEN-bit read/write register containing -information on pending interrupts, while mie is the corresponding -MXLEN-bit read/write register containing interrupt enable bits. -Interrupt cause number i (as reported in CSR mcause, -Section 3.1.15) corresponds with bit i in both mip and -mie. Bits 15:0 are allocated to standard interrupt causes only, while -bits 16 and above are designated for platform use.

-
-
-
-Diagram -
-
Figure 11. Machine Interrupt-Pending (mip) register.
-
-
-
-Diagram -
-
Figure 12. Machine Interrupt-Enable (mie) register
-
-
-

An interrupt i will trap to M-mode (causing the privilege mode to -change to M-mode) if all of the following are true: (a) either the -current privilege mode is M and the MIE bit in the mstatus register is -set, or the current privilege mode has less privilege than M-mode; -(b) bit i is set in both mip and mie; and (c) bit i is not set in mideleg.

-
-
-

These conditions for an interrupt trap to occur must be evaluated in a -bounded amount of time from when an interrupt becomes, or ceases to be, -pending in mip, and must also be evaluated immediately following the -execution of an xRET instruction or an explicit write to a CSR on -which these interrupt trap conditions expressly depend (including mip, -mie, mstatus, and mideleg).

-
-
-

Interrupts to M-mode take priority over any interrupts to lower -privilege modes.

-
-
-

[CV64A6_MMU] Each individual bit in register mip is read-only. If interrupt i -can become pending but bit i in mip is read-only, the implementation -must provide some other mechanism for clearing the pending interrupt.

-
-
-

[CV64A6_MMU] TODO: A bit in mie must be writable if the corresponding interrupt can ever -become pending. Bits of mie that are not writable must be read-only -zero.

-
-
-

[CV64A6_MMU] The standard portions (bits 15:0) of registers mip and mie are -formatted as shown in Figure 13 and Figure 14 respectively.

-
-
-
-Diagram -
-
Figure 13. Standard portion (bits 15:0) of mip.
-
-
-
-Diagram -
-
Figure 14. Standard portion (bits 15:0) of mie.
-
-
-

Bits mip.MEIP and mie.MEIE are the interrupt-pending and -interrupt-enable bits for machine-level external interrupts. MEIP is -read-only in mip, and is set and cleared by a platform-specific -interrupt controller.

-
-
-

Bits mip.MTIP and mie.MTIE are the interrupt-pending and -interrupt-enable bits for machine timer interrupts. MTIP is read-only in -mip, and is cleared by writing to the memory-mapped machine-mode timer -compare register.

-
-
-

As the system has only one hart, mip.MSIP and mie.MSIE are -read-only zeros.

-
-
-

Bits mip.SEIP and mie.SEIE are -the interrupt-pending and interrupt-enable bits for supervisor-level -external interrupts. SEIP is writable in mip, and may be written by -M-mode software to indicate to S-mode that an external interrupt is -pending.

-
-
-

Bits mip.STIP and mie.STIE are -the interrupt-pending and interrupt-enable bits for supervisor-level -timer interrupts. STIP is writable in mip, and may be written by -M-mode software to deliver timer interrupts to S-mode.

-
-
-

Bits mip.SSIP and mie.SSIE are -the interrupt-pending and interrupt-enable bits for supervisor-level -software interrupts. SSIP is writable in mip and may also be set to 1 -by a platform-specific interrupt controller.

-
-
-

As the Sscofpmf extension is not implemented, mip.LCOFIP and mie.LCOFIE are read-only zeros.

-
-
-

Multiple simultaneous interrupts destined for M-mode are handled in the -following decreasing priority order: MEI, MSI, MTI, SEI, SSI, STI.

-
-
-
-

3.1.10. Hardware Performance Monitor

-
-

M-mode includes a basic hardware performance-monitoring facility. The -mcycle CSR counts the number of clock cycles executed by the processor -core on which the hart is running. The minstret CSR counts the number -of instructions the hart has retired. The mcycle and minstret -registers have 64-bit precision on all RV32 and RV64 harts.

-
-
-

The counter registers have an arbitrary value after the hart is reset, -and can be written with a given value. Any CSR write takes effect after -the writing instruction has otherwise completed. The mcycle CSR may be -shared between harts on the same core, in which case writes to mcycle -will be visible to those harts. The platform should provide a mechanism -to indicate which harts share an mcycle CSR.

-
-
-

[CV64A6_MMU] The hardware performance monitor includes 29 additional 64-bit event -counters, mhpmcounter3-mhpmcounter31. The event selector CSRs, -mhpmevent3-mhpmevent31, are 64-bit WARL registers that control which -event causes the corresponding counter to increment. The meaning of -these events is defined by the platform, but event 0 is defined to mean -"no event." In CV64A6_MMU all counters are implemented, but both the counter and its corresponding event -selector are read-only 0.

-
-
-
-Diagram -
-
Figure 15. Hardware performance monitor counters.
-
-
-

The mhpmcounters are WARL registers that support up to 64 bits of -precision on RV32 and RV64.

-
-
-

As XLEN=64, mcycleh, minstreth, and mhpmcounternh -do not exist.

-
-
-
-

3.1.11. Machine Counter-Enable (mcounteren) Register

-
-

The counter-enable mcounteren register is a 32-bit register that -controls the availability of the hardware performance-monitoring -counters to the next-lower privileged mode.

-
-
-
-Diagram -
-
Figure 16. Counter-enable (mcounteren) register.
-
-
-

The settings in this register only control accessibility. The act of -reading or writing this register does not affect the underlying -counters, which continue to increment even when not accessible.

-
-
-

When the CY, TM, IR, or HPMn bit in the mcounteren register is -clear, attempts to read the cycle, time, instret, or -hpmcountern register while executing in S-mode or U-mode will cause an -illegal-instruction exception. When one of these bits is set, access to -the corresponding register is permitted in the next implemented -privilege mode (S-mode if implemented, otherwise U-mode).

-
-
-
-

3.1.12. Machine Counter-Inhibit (mcountinhibit) Register

-
-
-Diagram -
-
Figure 17. Counter-inhibit mcountinhibit register
-
-
-

[CV64A6_MMU] The mcountinhibit register is not implemented; the implementation -behaves as though the register were set to zero.

-
-
-
-

3.1.13. Machine Scratch (mscratch) Register

-
-

The mscratch register is an MXLEN-bit read/write register dedicated -for use by machine mode. Typically, it is used to hold a pointer to a -machine-mode hart-local context space and swapped with a user register -upon entry to an M-mode trap handler.

-
-
-
-Diagram -
-
Figure 18. Machine-mode scratch register.
-
-
-
-

3.1.14. Machine Exception Program Counter (mepc) Register

-
-

mepc is an MXLEN-bit read/write register formatted as shown in -Figure 19. The low bit of mepc (mepc[0]) is -always zero.

-
-
-

mepc is a WARL register that must be able to hold all valid virtual -addresses. It need not be capable of holding all possible invalid -addresses. Prior to writing mepc, implementations may convert an -invalid address into some other invalid address that mepc is capable -of holding.

-
-
-

When a trap is taken into M-mode, mepc is written with the virtual -address of the instruction that was interrupted or that encountered the -exception. Otherwise, mepc is never written by the implementation, -though it may be explicitly written by software.

-
-
-
-Diagram -
-
Figure 19. Machine exception program counter register.
-
-
-
-

3.1.15. Machine Cause (mcause) Register

-
-

The mcause register is an MXLEN-bit read-write register formatted as -shown in Figure 20. When a trap is taken into -M-mode, mcause is written with a code indicating the event that -caused the trap. Otherwise, mcause is never written by the -implementation, though it may be explicitly written by software.

-
-
-

The Interrupt bit in the mcause register is set if the trap was caused -by an interrupt. The Exception Code field contains a code identifying -the last exception or interrupt. Table 12 lists -the possible machine-level exception codes. The Exception Code is a -WLRL field, so is only guaranteed to hold supported exception codes.

-
-
-
-Diagram -
-
Figure 20. Machine Cause (mcause) register.
-
-
-

Note that load and load-reserved instructions generate load exceptions, -whereas store, store-conditional, and AMO instructions generate -store/AMO exceptions.

-
-
-

[CV64A6_MMU] Note that load and load-reserved instructions generate load exceptions, -whereas store and store-conditional instructions generate -store exceptions.

-
-
-

[CVA6] If an instruction may raise multiple synchronous exceptions, the -decreasing priority order of -Table 13 indicates which -exception is taken and reported in mcause. The priority of any custom -synchronous exceptions is implementation-defined. TODO

-
-
- - ----- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
Table 12. Machine cause (mcause) register values after trap.
InterruptException CodeDescription

1
-1
-1
-1

0
-1
-2
-3

Reserved
-Supervisor software interrupt
-Reserved
-Machine software interrupt

1
-1
-1
-1

4
-5
-6
-7

Reserved
-Supervisor timer interrupt
-Reserved
-Machine timer interrupt

1
-1
-1
-1

8
-9
-10
-11

Reserved
-Supervisor external interrupt
-Reserved
-Machine external interrupt

1
-1
-1
-1

12
-13
-14-15
-≥16

Reserved
-Counter-overflow interrupt
-Reserved
-Designated for platform use

0
-0
-0
-0
-0
-0
-0
-0
-0
-0
-0
-0
-0
-0
-0
-0
-0
-0
-0
-0
-0
-0

0
-1
-2
-3
-4
-5
-6
-7
-8
-9
-10
-11
-12
-13
-14
-15
-16
-17
-18
-19
-20-23
-24-31
-32-47
-48-63
-≥64

Instruction address misaligned
-Instruction access fault
-Illegal instruction
-Breakpoint
-Load address misaligned
-Load access fault
-Store/AMO address misaligned
-Store/AMO access fault
-Environment call from U-mode
-Environment call from S-mode
-Reserved
-Environment call from M-mode
-Instruction page fault
-Load page fault
-Reserved
-Store/AMO page fault
-Double trap
-Reserved
-Software check
-Hardware error
-Reserved
-Designated for custom use
-Reserved
-Designated for custom use
-Reserved

-
- - ----- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
Table 13. Synchronous exception priority in decreasing priority order.
PriorityExc.CodeDescription

Highest

3

Instruction address breakpoint

12, 1

During instruction address translation:
-First encountered page fault or access fault

1

With physical address for instruction:
-Instruction access fault

2
-0
-8,9,11
-3
-3

Illegal instruction
-Instruction address misaligned
-Environment call
-Environment break
-Load/store/AMO address breakpoint

4,6

Optionally:
-Load/store/AMO address misaligned

13, 15, 5, 7

During address translation for an explicit memory access:
-First encountered page fault or access fault

5,7

With physical address for an explicit memory access:
-Load/store/AMO access fault

Lowest

4,6

If not higher priority:
-Load/store/AMO address misaligned

-
-

[CV64A6_MMU] Load/store address-misaligned exceptions may have either higher or -lower priority than load/store access-fault -exceptions. TODO

-
-
-
-

3.1.16. Machine Trap Value (mtval) Register

-
-

[CV64A6_MMU] The mtval register is an MXLEN-bit read-write register -holding the constant value zero.

-
-
-
-

3.1.17. Machine Configuration Pointer (mconfigptr) Register

-
-

The mconfigptr register is an MXLEN-bit read-only CSR that holds the physical -address of a configuration data structure.

-
-
-

[CV64A6_MMU] The mconfigptr register is implemented, but it is read-only 0 to indicate the -configuration data structure does not exist.

-
-
-
-

3.1.18. Machine Environment Configuration (menvcfg) Register

-
-

The menvcfg CSR is a 64-bit read/write register, formatted -as shown in Figure 21, that controls -certain characteristics of the execution environment for modes less -privileged than M.

-
-
-
-Diagram -
-
Figure 21. Machine environment configuration (menvcfg) register.
-
-
-

If bit FIOM (Fence of I/O implies Memory) is set to one in menvcfg, -FENCE instructions executed in modes less privileged than M are modified -so the requirement to order accesses to device I/O implies also the -requirement to order main memory accesses. Table 14 -details the modified interpretation of FENCE instruction bits PI, PO, -SI, and SO for modes less privileged than M when FIOM=1.

-
-
-

Similarly, for modes less privileged than M when FIOM=1, if an atomic -instruction that accesses a region ordered as device I/O has its aq -and/or rl bit set, then that instruction is ordered as though it -accesses both device I/O and memory.

-
-
-

If S-mode is not supported, or if satp.MODE is read-only zero (always -Bare), the implementation may make FIOM read-only zero.

-
- - ---- - - - - - - - - - - - - - - - - -
Table 14. Modified interpretation of FENCE predecessor and successor sets for modes less privileged than M when FIOM=1.
Instruction bitMeaning when set

PI
-PO

Predecessor device input and memory reads (PR implied)
-Predecessor device output and memory writes (PW implied)

SI
-SO

Successor device input and memory reads (SR implied)
-Successor device output and memory writes (SW implied)

-
-

The PBMTE bit controls whether the Svpbmt extension is available for use -in S-mode and G-stage address translation (i.e., for page tables pointed -to by satp or hgatp).

-
-
-

[CV64A6_MMU] As Svpbmt is not implemented, PBMTE is always 0.

-
-
-

The ADUE bit controls whether hardware -updating of PTE A/D bits is enabled for S-mode and G-stage address -translations.

-
-
-

[CV64A6_MMU] As Svadu is not implemented, ADUE is always 0.

-
-
-

The CDE (Counter Delegation Enable) bit controls whether Zicntr and Zihpm counters can be delegated to S-mode.

-
-
-

[CV64A6_MMU] As Smcdeleg is not implemented, CDE is always 0.

-
-
-

The definition of the STCE field is furnished by the Sstc extension.

-
-
-

[CV64A6_MMU] As Sstc is not implemented, STCE is always 0.

-
-
-

The definition of the CBZE field is furnished by the Zicboz extension.

-
-
-

[CV64A6_MMU] As Zicboz is not implemented, CBZE is always 0.

-
-
-

The definitions of the CBCFE and CBIE fields are furnished by the Zicbom extension.

-
-
-

[CV64A6_MMU] As Zicbom is not implemented, the CBCFE and CBIE fields are always 0.

-
-
-

The definition of the PMM field will be furnished by the forthcoming -Smnpm extension. Its allocation within menvcfg may change prior to the -ratification of that extension.

-
-
-

[CV64A6_MMU] As Smnpm is not implemented, the PMM field is always 0.

-
-
-

[CV64A6_MMU] As Zicfilp is not implemented, the LPE field is always 0.

-
-
-

[CV64A6_MMU] As Zicfiss is not implemented, the SSE field is always 0.

-
-
-
-

3.1.19. Machine Security Configuration (mseccfg) Register

-
-

mseccfg is an optional 64-bit read/write register, -that controls security features.

-
-
-

As XLEN=64, register mseccfgh does not exist.

-
-
-

[CV64A6_MMU] As the Zkr, Smepmp, and Smmpm extensions are not implemented, -mseccfg and mseccfgh do not exist. TODO.

-
-
-
-
-

3.2. Machine-Level Memory-Mapped Registers

-
-

3.2.1. Machine Timer (mtime and mtimecmp) Registers

-
-

Platforms provide a real-time counter, exposed as a memory-mapped -machine-mode read-write register, mtime. mtime must increment at -constant frequency, and the platform must provide a mechanism for -determining the period of an mtime tick. The mtime register will -wrap around if the count overflows.

-
-
-

The mtime register has a 64-bit precision on all RV32 and RV64 -systems. Platforms provide a 64-bit memory-mapped machine-mode timer -compare register (mtimecmp). A machine timer interrupt becomes pending -whenever mtime contains a value greater than or equal to mtimecmp, -treating the values as unsigned integers. The interrupt remains posted -until mtimecmp becomes greater than mtime (typically as a result of -writing mtimecmp). The interrupt will only be taken if interrupts are -enabled and the MTIE bit is set in the mie register.

-
-
-
-Diagram -
-
Figure 22. Machine time register (memory-mapped control register).
-
-
-
-Diagram -
-
Figure 23. Machine time compare register (memory-mapped control register).
-
-
-

If the result of the comparison between mtime and mtimecmp changes, it is -guaranteed to be reflected in MTIP eventually, but not necessarily -immediately.

-
-
-

For RV64, naturally aligned 64-bit memory accesses to the mtime and -mtimecmp registers are additionally supported and are atomic.

-
-
-
-
-

3.3. Machine-Mode Privileged Instructions

-
-

3.3.1. Environment Call and Breakpoint

-
-
-Diagram -
-
-
-

The ECALL instruction is used to make a request to the supporting -execution environment. When executed in U-mode, S-mode, or M-mode, it -generates an environment-call-from-U-mode exception, -environment-call-from-S-mode exception, or environment-call-from-M-mode -exception, respectively, and performs no other operation.

-
-
-

The EBREAK instruction is used by debuggers to cause control to be -transferred back to a debugging environment. -Unless overridden by an external debug environment, EBREAK raises -a breakpoint exception and performs no other operation.

-
-
-

ECALL and EBREAK cause the receiving privilege mode’s epc register to -be set to the address of the ECALL or EBREAK instruction itself, not -the address of the following instruction. As ECALL and EBREAK cause -synchronous exceptions, they are not considered to retire, and should -not increment the minstret CSR.

-
-
-
-

3.3.2. Trap-Return Instructions

-
-

Instructions to return from trap are encoded under the PRIV minor -opcode.

-
-
-
-Diagram -
-
-
-

To return after handling a trap, there are separate trap return -instructions per privilege level, MRET and SRET. MRET is always -provided. SRET must be provided if supervisor mode is supported, and -should raise an illegal-instruction exception otherwise. SRET should -also raise an illegal-instruction exception when TSR=1 in mstatus, as -described in Section 3.1.6.6. An xRET instruction -can be executed in privilege mode x or higher, where executing a -lower-privilege xRET instruction will pop the relevant lower-privilege -interrupt enable and privilege mode stack. In addition to manipulating -the privilege stack as described in Section 3.1.6.1, -xRET sets the pc to the value stored in the xepc register.

-
-
-

If the A extension is supported, the xRET instruction is allowed to -clear any outstanding LR address reservation but is not required to. -Trap handlers should explicitly clear the reservation if required (e.g., -by using a dummy SC) before executing the xRET.

-
-
-
-

3.3.3. Wait for Interrupt

-
-

The Wait for Interrupt instruction (WFI) informs the -implementation that the current hart can be stalled until an interrupt -might need servicing. Execution of the WFI instruction can also be used -to inform the hardware platform that suitable interrupts should -preferentially be routed to this hart. WFI is available in all -privileged modes, and optionally available to U-mode. This instruction -may raise an illegal-instruction exception when TW=1 in mstatus, as -described in Section 3.1.6.6.

-
-
-
-Diagram -
-
-
-

If an enabled interrupt is present or later becomes present while the -hart is stalled, the interrupt trap will be taken on the following -instruction, i.e., execution resumes in the trap handler and mepc = -pc + 4.

-
-
-

Implementations are permitted to resume execution for any reason, even if an -enabled interrupt has not become pending. Hence, a legal implementation is to -simply implement the WFI instruction as a NOP.

-
-
-

The WFI instruction can also be executed when interrupts are disabled. -The operation of WFI must be unaffected by the global interrupt bits in -mstatus (MIE and SIE) and the delegation register mideleg (i.e., -the hart must resume if a locally enabled interrupt becomes pending, -even if it has been delegated to a less-privileged mode), but should -honor the individual interrupt enables (e.g., MTIE) (i.e., -implementations should avoid resuming the hart if the interrupt is -pending but not individually enabled). WFI is also required to resume -execution for locally enabled interrupts pending at any privilege level, -regardless of the global interrupt enable at each privilege level.

-
-
-

If the event that causes the hart to resume execution does not cause an -interrupt to be taken, execution will resume at pc + 4, and software -must determine what action to take, including looping back to repeat the -WFI if there was no actionable event.

-
-
-
-

3.3.4. Custom SYSTEM Instructions

-
-

The subspace of the SYSTEM major opcode shown in Figure 24 is designated for custom use. It is recommended that these instructions use bits 29:28 to designate the -minimum required privilege mode, as do other SYSTEM instructions.

-
-
-
-Diagram -
-
Figure 24. SYSTEM instruction encodings designated for custom use.
-
-
-
-
-

3.4. Reset

-
-

[CV64A6_MMU] Upon reset, a hart’s privilege mode is set to M. The mstatus fields -MIE and MPRV are reset to 0. -As little-endian memory accesses are supported, -the mstatus field MBE is reset to 0. -The misa register is set as described in Section 3.1.1. -The pc is set to the 0x80000000 reset vector. TODO -The mcause register is set to a value indicating the cause of the reset. -Writable PMP registers’ A and L fields are set to 0. -No WARL field contains an illegal value. All other hart state is UNSPECIFIED.

-
-
-

As "CV64A6_MMU" does not distinguish different reset conditions, -mcause returns 0 after reset.

-
-
-
-

3.5. Non-Maskable Interrupts

-
-

Non-maskable interrupts (NMIs) are only used for hardware error -conditions, and cause an immediate jump to an implementation-defined NMI -vector running in M-mode regardless of the state of a hart’s interrupt -enable bits. The mepc register is written with the virtual address of -the instruction that was interrupted, and mcause is set to a value -indicating the source of the NMI. The NMI can thus overwrite state in an -active machine-mode interrupt handler.

-
-
-

[CV64A6_MMU] Upon NMI, the high Interrupt bit of mcause is set to indicate -that this was an interrupt. As CV64A6_MMU does not distinguish sources -of NMIs, the mcause register returns 0 in the Exception Code.

-
-
-

Unlike resets, NMIs do not reset processor state, enabling diagnosis, -reporting, and possible containment of the hardware error.

-
-
-
-

3.6. Physical Memory Attributes

-
-

The physical memory map for a complete system includes various address -ranges, some corresponding to memory regions and some to memory-mapped -control registers, portions of which might not be accessible. Some -memory regions might not support reads, writes, or execution; some might -not support subword or subblock accesses; some might not support atomic -operations; and some might not support cache coherence or might have -different memory models. Similarly, memory-mapped control registers vary -in their supported access widths, support for atomic operations, and -whether read and write accesses have associated side effects. In RISC-V -systems, these properties and capabilities of each region of the -machine’s physical address space are termed physical memory attributes -(PMAs). This section describes RISC-V PMA terminology and how RISC-V -systems implement and check PMAs.

-
-
-

[CV64A6_MMU] PMAs are inherent properties of the underlying hardware. The PMAs of -some memory regions are fixed at chip design time.

-
-
-

[CV64A6_MMU] Some PMAs are dynamically -checked in hardware later in the execution pipeline after the physical -address is known, as some operations will not be supported at all -physical memory addresses, and some operations require knowing the -setting of a PMA attribute.

-
-
-

[CV64A6_MMU] For RISC-V, we separate out specification and checking of PMAs into a -separate hardware structure, the PMA checker. In CV64A6_MMU, the -attributes are known at system design time for each physical address -region, and are hardwired into the PMA checker. -PMAs are checked for any access to physical memory, including accesses -that have undergone virtual to physical memory translation. To aid in -system debugging, we strongly recommend that, where possible, RISC-V -processors precisely trap physical memory accesses that fail PMA checks. -Precisely trapped PMA violations manifest as instruction, load, or store -access-fault exceptions, distinct from virtual-memory page-fault -exceptions. Precise PMA traps might not always be possible, for example, -when probing a legacy bus architecture that uses access failures as part -of the discovery mechanism. In this case, error responses from -peripheral devices will be reported as imprecise bus-error interrupts.

-
-
-

[CV64A6_MMU] PMAs are not readable by software.

-
-
-

3.6.1. Main Memory versus I/O Regions

-
-

The most important characterization of a given memory address range is -whether it holds regular main memory or I/O devices. -Regular main memory is required to have a number of properties, -specified below, whereas I/O devices can have a much broader range of -attributes. Memory regions that do not fit into regular main memory, for -example, device scratchpad RAMs, are categorized as I/O regions.

-
-
- - - - - -
- - -What previous versions of this specification termed vacant regions are -no longer a distinct category; they are now described as I/O regions that are -not accessible (i.e. lacking read, write, and execute permissions). -Main memory regions that are not accessible are also allowed. -
-
-
-
-

3.6.2. Supported Access Type PMAs

-
-

Access types specify which access widths, from 8-bit byte to long -multi-word burst, are supported, and also whether misaligned accesses -are supported for each access width.

-
-
-

Main memory regions always support read and write of all access widths -required by the attached devices, and can specify whether instruction -fetch is supported.

-
-
-

I/O regions can specify which combinations of read, write, or execute -accesses to which data widths are supported.

-
-
-

For systems with page-based virtual memory, I/O and memory regions can -specify which combinations of hardware page-table reads and hardware -page-table writes are supported.

-
-
-
-

3.6.3. Atomicity PMAs

-
-

[CV64A6_MMU] Atomic extension is not implemented.

-
-
-
3.6.3.1. AMO PMA
-
-

[CV64A6_MMU] Atomic extension is not implemented.

-
-
-
-
3.6.3.2. Reservability PMA
-
-

[CV64A6_MMU] Atomic extension is not implemented.

-
-
-
-
-

3.6.4. Misaligned Atomicity Granule PMA

-
-

[CV64A6_MMU] Atomic extension is not implemented.

-
-
-
-

3.6.5. Memory-Ordering PMAs

-
-

[CV64A6_MMU] As CV64A6_MMU is dedicated to a single-hart -platform without any DMA, no memory-ordering mechanism is implemented.

-
-
-
-

3.6.6. Coherence and Cacheability PMAs

-
-

[CV64A6_MMU] Write accesses are not cached. No cache-coherence scheme -is implemented.

-
-
-

If a PMA indicates non-cacheability, then accesses to that region must -be satisfied by the memory itself, not by any caches.

-
-
-
-

3.6.7. Idempotency PMAs

-
-

Idempotency PMAs describe whether reads and writes to an address region -are idempotent. Main memory regions are assumed to be idempotent. For -I/O regions, idempotency on reads and writes can be specified separately -(e.g., reads are idempotent but writes are not). If accesses are -non-idempotent, i.e., there is potentially a side effect on any read or -write access, then speculative or redundant accesses must be avoided.

-
-
-

For the purposes of defining the idempotency PMAs, changes in observed -memory ordering created by redundant accesses are not considered a side -effect.

-
-
-

For non-idempotent regions, implicit reads and writes must not be -performed early or speculatively, with the following exceptions. When a -non-speculative implicit read is performed, an implementation is -permitted to additionally read any of the bytes within a naturally -aligned power-of-2 region containing the address of the non-speculative -implicit read. Furthermore, when a non-speculative instruction fetch is -performed, an implementation is permitted to additionally read any of -the bytes within the next naturally aligned power-of-2 region of the -same size (with the address of the region taken modulo -$2^{\text{XLEN}}$). The results of these additional reads -may be used to satisfy subsequent early or speculative implicit reads. -The size of these naturally aligned power-of-2 regions is -implementation-defined, but, for systems with page-based virtual memory, -must not exceed the smallest supported page size.

-
-
-
-
-

3.7. Physical Memory Protection

-
-

To support secure processing and contain faults, it is desirable to -limit the physical addresses accessible by software running on a hart. -An optional physical memory protection (PMP) unit provides per-hart -machine-mode control registers to allow physical memory access -privileges (read, write, execute) to be specified for each physical -memory region. The PMP values are checked in parallel with the PMA -checks described in Section 3.6.

-
-
-

The granularity of PMP access control settings is platform-specific, -but the standard PMP encoding supports regions as small as four bytes. -Certain regions’ privileges can be hardwired—for example, some regions -might only ever be visible in machine mode but in no lower-privilege -layers.

-
-
-

PMP checks are applied to all accesses whose effective privilege mode is -S or U, including instruction fetches and data accesses in S and U mode, -and data accesses in M-mode when the MPRV bit in mstatus is set and -the MPP field in mstatus contains S or U. PMP checks are also applied -to page-table accesses for virtual-address translation, for which the -effective privilege mode is S. Optionally, PMP checks may additionally -apply to M-mode accesses, in which case the PMP registers themselves are -locked, so that even M-mode software cannot change them until the hart -is reset. In effect, PMP can grant permissions to S and U modes, which -by default have none, and can revoke permissions from M-mode, which by -default has full permissions.

-
-
-

PMP violations are always trapped precisely at the processor.

-
-
-

3.7.1. Physical Memory Protection CSRs

-
-

PMP entries are described by an 8-bit configuration register and one -MXLEN-bit address register. Some PMP settings additionally use the -address register associated with the preceding PMP entry. 64 PMP -entries are implemented. The lowest-numbered PMP entries must be -implemented first. All PMP CSR fields are WARL and 56 upper entries are -read-only zero. PMP CSRs are only accessible to M-mode.

-
-
-

[CV64A6_MMU] The PMP configuration registers are densely packed into CSRs to minimize -context-switch time. For CV64A6_MMU with sixty-four PMP entries, the CSRs pmpcfg0–pmpcfg15 hold -the configurations as shown -in Figure 25. -The 14 upper entries are read-only zero.

-
-
-
-Diagram -
-
Figure 25. RV32 PMP configuration CSR layout.
-
-
-

[CV64A6_MMU] The PMP address registers are CSRs named pmpaddr0-pmpaddr63. Each -PMP address register encodes bits 33-2 of a 34-bit physical address for -RV32, as shown in Figure 26. Not all -physical address bits may be implemented, and so the pmpaddr registers -are WARL.

-
-
-
-Diagram -
-
Figure 26. PMP address register format, RV32.
-
-
-

Figure 27 shows the layout of a PMP configuration -register. The R, W, and X bits, when set, indicate that the PMP entry -permits read, write, and instruction execution, respectively. When one -of these bits is clear, the corresponding access type is denied. The R, -W, and X fields form a collective WARL field for which the combinations with R=0 and W=1 are reserved. The remaining two fields, A and L, are described in the following sections.

-
-
-
-Diagram -
-
Figure 27. PMP configuration register format.
-
-
-

Attempting to fetch an instruction from a PMP region that does not have -execute permissions raises an instruction access-fault exception. -Attempting to execute a load or load-reserved instruction which accesses -a physical address within a PMP region without read permissions raises a -load access-fault exception. Attempting to execute a store, -store-conditional, or AMO instruction which accesses a physical address -within a PMP region without write permissions raises a store -access-fault exception.

-
-
-
3.7.1.1. Address Matching
-
-

The A field in a PMP entry’s configuration register encodes the -address-matching mode of the associated PMP address register. The -encoding of this field is shown in Table 15.

-
-
-

When A=0, this PMP entry is disabled and matches no addresses. Two other -address-matching modes are supported: naturally aligned power-of-2 -regions (NAPOT), including the special case of naturally aligned -four-byte regions (NA4); and the top boundary of an arbitrary range -(TOR). These modes support four-byte granularity.

-
-
-

[CV64A6_MMU] Two address-matching modes are supported: disabled and TOR.

-
- - ----- - - - - - - - - - - - - - - -
Table 15. Encoding of A field in PMP configuration registers.
ANameDescription

0
-1
-2
-3

OFF
-TOR
-NA4
-NAPOT

Null region (disabled)
-Top of range
-Not supported
-Not supported

-
-

If TOR is selected, the associated address register forms the top of the -address range, and the preceding PMP address register forms the bottom -of the address range. If PMP entry $i$'s A field is set to -TOR, the entry matches any address $y$ such that $\mathtt{pmpaddr}_{i-1} \le y < \mathtt{pmpaddr}_i$ (irrespective of the value of $\mathtt{pmpcfg}_{i-1}$). If PMP entry 0’s A field is set to TOR, zero is used for the lower bound, and so it matches -any address $y < \mathtt{pmpaddr}_0$.

-
-
-

[CV64A6_MMU] Although the PMP mechanism supports regions as small as four bytes, -platforms may specify coarser PMP regions. In general, the PMP grain is -$2^{G+2}$ bytes and must be the same across all PMP regions. -When $G \ge 1$ and -$\mathtt{pmpcfg}_i$.A[1] is clear, i.e. the mode is OFF or TOR, -then bits $\mathtt{pmpaddr}_i$[G-1:0] read as all zeros. Bits -$\mathtt{pmpaddr}_i$[G-1:0] do not affect the TOR address-matching -logic.

-
-
-

If the current XLEN is greater than MXLEN, the PMP address registers are -zero-extended from MXLEN to XLEN bits for the purposes of address -matching.

-
-
-
-
3.7.1.2. Locking and Privilege Mode
-
-

The L bit indicates that the PMP entry is locked, i.e., writes to the -configuration register and associated address registers are ignored. -Locked PMP entries remain locked until the hart is reset. If PMP entry -$i$ is locked, writes to pmp$i$cfg and $\mathtt{pmpaddr}_i$ are ignored. Additionally, if PMP -entry $i$ is locked and pmp$i$cfg.A is set -to TOR, writes to $\mathtt{pmpaddr}_{i-1}$ are ignored.

-
-
-

In addition to locking the PMP entry, the L bit indicates whether the -R/W/X permissions are enforced on M-mode accesses. When the L bit is -set, these permissions are enforced for all privilege modes. When the L -bit is clear, any M-mode access matching the PMP entry will succeed; the -R/W/X permissions apply only to S and U modes.

-
-
-
-
3.7.1.3. Priority and Matching Logic
-
-

PMP entries are statically prioritized. The lowest-numbered PMP entry -that matches any byte of an access determines whether that access -succeeds or fails. The matching PMP entry must match all bytes of an -access, or the access fails, irrespective of the L, R, W, and X bits. -For example, if a PMP entry is configured to match the four-byte range -0xC–0xF, then an 8-byte access to the range 0x8–0xF will fail, -assuming that PMP entry is the highest-priority entry that matches those -addresses.

-
-
-

If a PMP entry matches all bytes of an access, then the L, R, W, and X -bits determine whether the access succeeds or fails. If the L bit is -clear and the privilege mode of the access is M, the access succeeds.

-
-
-

Otherwise, if the L bit is set or the privilege mode of the access is S -or U, then the access succeeds only if the R, W, or X bit corresponding -to the access type is set.

-
-
-

If no PMP entry matches an M-mode access, the access succeeds. If no PMP -entry matches an S-mode or U-mode access, but at least one PMP entry is -implemented, the access fails.

-
-
-

Failed accesses generate an instruction, load, or store access-fault -exception. Note that a single instruction may generate multiple -accesses, which may not be mutually atomic. An access-fault exception is -generated if at least one access generated by an instruction fails, -though other accesses generated by that instruction may succeed with -visible side effects. Notably, instructions that reference virtual -memory are decomposed into multiple accesses.

-
-
-

On some implementations, misaligned loads, stores, and instruction -fetches may also be decomposed into multiple accesses, some of which may -succeed before an access-fault exception occurs. In particular, a -portion of a misaligned store that passes the PMP check may become -visible, even if another portion fails the PMP check. The same behavior -may manifest for stores wider than XLEN bits (e.g., the FSD instruction -in RV32D), even when the store address is naturally aligned.

-
-
-
-
-

3.7.2. Physical Memory Protection and Paging

- -
-
-
-
-
-

4. "Smstateen/Ssstateen" Extensions, Version 1.0

-
-
-

CV64A6_MMU: This extension is not supported.

-
-
-
-
-

5. "Smcsrind/Sscsrind" Indirect CSR Access, Version 1.0

-
-
-

CV64A6_MMU: This extension is not supported.

-
-
-
-
-

6. "Smepmp" Extension for PMP Enhancements for memory access and execution prevention in Machine mode, Version 1.0

-
-
-

CV64A6_MMU: This extension is not supported.

-
-
-
-
-

7. "Smcntrpmf" Cycle and Instret Privilege Mode Filtering, Version 1.0

-
-
-

CV64A6_MMU: This extension is not supported.

-
-
-
-
-

8. "Smrnmi" Extension for Resumable Non-Maskable Interrupts, Version 0.5

-
-
-

CV64A6_MMU: This extension is not supported.

-
-
-
-
-

9. "Smcdeleg" Counter Delegation Extension, Version 1.0

-
-
-

CV64A6_MMU: This extension is not supported.

-
-
-
-
-

10. "Smdbltrp" Double Trap Extension, Version 1.0

-
- -
-
-
-

11. Supervisor-Level ISA, Version 1.13

-
-
-

This chapter describes the RISC-V supervisor-level architecture, which -contains a common core that is used with various supervisor-level -address translation and protection schemes.

-
-
-

11.1. Supervisor CSRs

-
-

A number of CSRs are provided for the supervisor.

-
-
-

11.1.1. Supervisor Status (sstatus) Register

-
-

The sstatus register is an SXLEN-bit read/write register formatted as -shown in Figure 28. The sstatus -register keeps track of the processor’s current operating state.

-
-
-
-Diagram -
-
Figure 28. Supervisor-mode status (sstatus) register when SXLEN=64.
-
-
-

The SPP bit indicates the privilege level at which a hart was executing -before entering supervisor mode. When a trap is taken, SPP is set to 0 -if the trap originated from user mode, or 1 otherwise. When an SRET -instruction (see Section 3.3.2) is executed to -return from the trap handler, the privilege level is set to user mode if -the SPP bit is 0, or supervisor mode if the SPP bit is 1; SPP is then -set to 0.

-
-
-

The SIE bit enables or disables all interrupts in supervisor mode. When -SIE is clear, interrupts are not taken while in supervisor mode. When -the hart is running in user-mode, the value in SIE is ignored, and -supervisor-level interrupts are enabled. The supervisor can disable -individual interrupt sources using the sie CSR.

-
-
-

The SPIE bit indicates whether supervisor interrupts were enabled prior -to trapping into supervisor mode. When a trap is taken into supervisor -mode, SPIE is set to SIE, and SIE is set to 0. When an SRET instruction -is executed, SIE is set to SPIE, then SPIE is set to 1.

-
-
-

The sstatus register is a subset of the mstatus register.

-
-
-
11.1.1.1. Base ISA Control in sstatus Register
-
-

[CV64A6_MMU] The UXL field is a read-only field that encodes the -value of XLEN for S-mode. The encoding of this -field is the same as the MXL field of misa, shown in Table 9. -The effective XLEN in S-mode is termed SXLEN. -Its value is set to SXLEN=MXLEN.

-
-
-
-
11.1.1.2. Memory Privilege in sstatus Register
-
-

The MXR (Make eXecutable Readable) bit modifies the privilege with which -loads access virtual memory. When MXR=0, only loads from pages marked -readable (R=1 in [sv32pte]) will succeed. When -MXR=1, loads from pages marked either readable or executable (R=1 or -X=1) will succeed. MXR has no effect when page-based virtual memory is -not in effect.

-
-
-

The SUM (permit Supervisor User Memory access) bit modifies the -privilege with which S-mode loads and stores access virtual memory. When -SUM=0, S-mode memory accesses to pages that are accessible by U-mode -(U=1 in [sv32pte]) will fault. When SUM=1, these -accesses are permitted. SUM has no effect when page-based virtual memory -is not in effect, nor when executing in U-mode. Note that S-mode can -never execute instructions from user pages, regardless of the state of -SUM.

-
-
-
-
11.1.1.3. Endianness Control in sstatus Register
-
-

UBE controls whether explicit load and store memory accesses made from -U-mode are little-endian (UBE=0) or big-endian (UBE=1).

-
-
-

Memory accesses made from U-mode are always little-endian, so UBE is read-only zero.

-
-
-
-
11.1.1.4. Previous Expected Landing Pad (ELP) State in sstatus Register
-
-

Access to the SPELP field, added by Zicfilp, accesses the homonymous -fields of mstatus when V=0, and the homonymous fields of vsstatus -when V=1.

-
-
-
-
11.1.1.5. Double Trap Control in sstatus Register
-
-

[CV64A6_MMU] As Double Trap Control (Ssdbltrp extension) is not implemented, -SDT field is read-only 0.

-
-
-
-
-

11.1.2. Supervisor Trap Vector Base Address (stvec) Register

-
-

The stvec register is an SXLEN-bit read/write register that holds trap -vector configuration, consisting of a vector base address (BASE) and a -vector mode (MODE).

-
-
-
-Diagram -
-
Figure 29. Supervisor trap vector base address (stvec) register.
-
-
-

The BASE field in stvec is a WARL field that can hold any valid virtual or -physical address, subject to the following alignment constraints: the -address must be 4-byte aligned, and MODE settings other than Direct -might impose additional alignment constraints on the value in the BASE -field.

-
- - ----- - - - - - - - - - - - - - - -
Table 16. Encoding of stvec MODE field.
ValueNameDescription

0
-1
-≥2

Direct
-Vectored

All exceptions set pc to BASE.
-Asynchronous interrupts set pc to BASE+4×cause.
-Reserved

-
-

The encoding of the MODE field is shown in -Table 16. When MODE=Direct, all traps into -supervisor mode cause the pc to be set to the address in the BASE -field. When MODE=Vectored, all synchronous exceptions into supervisor -mode cause the pc to be set to the address in the BASE field, whereas -interrupts cause the pc to be set to the address in the BASE field -plus four times the interrupt cause number. For example, a -supervisor-mode timer interrupt (see Table 17) -causes the pc to be set to BASE+0x14. Setting MODE=Vectored may -impose a stricter alignment constraint on BASE.

-
-
-
-

11.1.3. Supervisor Interrupt (sip and sie) Registers

-
-

The sip register is an SXLEN-bit read/write register containing -information on pending interrupts, while sie is the corresponding -SXLEN-bit read/write register containing interrupt enable bits. -Interrupt cause number i (as reported in CSR scause, -Section 11.1.8) corresponds with bit i in both sip and -sie. Bits 15:0 are allocated to standard interrupt causes only, while -bits 16 and above are designated for platform use.

-
-
-
-Diagram -
-
Figure 30. Supervisor interrupt-pending register (sip).
-
-
-
-Diagram -
-
Figure 31. Supervisor interrupt-enable register (sie).
-
-
-

An interrupt i will trap to S-mode if both of the following are true: -(a) either the current privilege mode is S and the SIE bit in the -sstatus register is set, or the current privilege mode has less -privilege than S-mode; and (b) bit i is set in both sip and sie.

-
-
-

These conditions for an interrupt trap to occur must be evaluated in a -bounded amount of time from when an interrupt becomes, or ceases to be, -pending in sip, and must also be evaluated immediately following the -execution of an SRET instruction or an explicit write to a CSR on which -these interrupt trap conditions expressly depend (including sip, sie -and sstatus).

-
-
-

Interrupts to S-mode take priority over any interrupts to lower -privilege modes.

-
-
-

Each individual bit in register sip may be writable or may be -read-only. When bit i in sip is writable, a pending interrupt i -can be cleared by writing 0 to this bit. If interrupt i can become -pending but bit i in sip is read-only, the implementation must -provide some other mechanism for clearing the pending interrupt (which -may involve a call to the execution environment).

-
-
-

A bit in sie must be writable if the corresponding interrupt can ever -become pending. Bits of sie that are not writable are read-only zero.

-
-
-

The standard portions (bits 15:0) of registers sip and sie are -formatted as shown in Figure 32 -and Figure 33, respectively.

-
-
-
-Diagram -
-
Figure 32. Standard portion (bits 15:0) of sip.
-
-
-
-Diagram -
-
Figure 33. Standard portion (bits 15:0) of sie.
-
-
-

Bits sip.SEIP and sie.SEIE are the interrupt-pending and -interrupt-enable bits for supervisor-level external interrupts. If -implemented, SEIP is read-only in sip, and is set and cleared by the -execution environment, typically through a platform-specific interrupt -controller.

-
-
-

Bits sip.STIP and sie.STIE are the interrupt-pending and -interrupt-enable bits for supervisor-level timer interrupts. If -implemented, STIP is read-only in sip, and is set and cleared by the -execution environment.

-
-
-

Bits sip.SSIP and sie.SSIE are the interrupt-pending and -interrupt-enable bits for supervisor-level software interrupts. If -implemented, SSIP is writable in sip and may also be set to 1 by a -platform-specific interrupt controller.

-
-
-

Each standard interrupt type (SEI, STI, SSI, or LCOFI) may not be implemented, -in which case the corresponding interrupt-pending and interrupt-enable -bits are read-only zeros. All bits in sip and sie are WARL fields. The -implemented interrupts may be found by writing one to every bit location -in sie, then reading back to see which bit positions hold a one.

-
-
-
-

11.1.4. Supervisor Timers and Performance Counters

-
-

Supervisor software uses the same hardware performance monitoring -facility as user-mode software, including the time, cycle, and -instret CSRs. The implementation should provide a mechanism to modify -the counter values.

-
-
-

The implementation must provide a facility for scheduling timer -interrupts in terms of the real-time counter, time.

-
-
-
-

11.1.5. Counter-Enable (scounteren) Register

-
-
-Diagram -
-
Figure 34. Counter-enable (scounteren) register
-
-
-

The counter-enable (scounteren) CSR is a 32-bit register that -controls the availability of the hardware performance monitoring -counters to U-mode.

-
-
-

When the CY, TM, IR, or HPMn bit in the scounteren register is -clear, attempts to read the cycle, time, instret, or hpmcountern -register while executing in U-mode will cause an illegal-instruction -exception. When one of these bits is set, access to the corresponding -register is permitted.

-
-
-
-

11.1.6. Supervisor Scratch (sscratch) Register

-
-

The sscratch CSR is an SXLEN-bit read/write register, dedicated -for use by the supervisor. Typically, sscratch is used to hold a -pointer to the hart-local supervisor context while the hart is executing -user code. At the beginning of a trap handler, sscratch is swapped -with a user register to provide an initial working register.

-
-
-
-Diagram -
-
Figure 35. Supervisor Scratch Register
-
-
-
-

11.1.7. Supervisor Exception Program Counter (sepc) Register

-
-

sepc is an SXLEN-bit read/write CSR formatted as shown in -Figure 36. The low bit of sepc (sepc[0]) is always zero. On implementations that support only IALIGN=32, the two low bits (sepc[1:0]) are always zero.

-
-
-

sepc is a WARL register that must be able to hold all valid virtual -addresses. It need not be capable of holding all possible invalid -addresses. Prior to writing sepc, implementations may convert an -invalid address into some other invalid address that sepc is capable -of holding.

-
-
-

When a trap is taken into S-mode, sepc is written with the virtual -address of the instruction that was interrupted or that encountered the -exception. Otherwise, sepc is never written by the implementation, -though it may be explicitly written by software.

-
-
-
-Diagram -
-
Figure 36. Supervisor exception program counter register.
-
-
-
-

11.1.8. Supervisor Cause (scause) Register

-
-

The scause CSR is an SXLEN-bit read-write register formatted as -shown in Figure 37. When a trap is taken into -S-mode, scause is written with a code indicating the event that -caused the trap. Otherwise, scause is never written by the -implementation, though it may be explicitly written by software.

-
-
-

The Interrupt bit in the scause register is set if the trap was caused -by an interrupt. The Exception Code field contains a code identifying -the last exception or interrupt. Table 17 lists -the possible exception codes for the current supervisor ISAs. The -Exception Code is a WLRL field. It is required to hold the values 0–31 -(i.e., bits 4–0 must be implemented), but otherwise it is only -guaranteed to hold supported exception codes.

-
-
-
-Diagram -
-
Figure 37. Supervisor Cause (scause) register.
-
- - ----- - - - - - - - - - - - - - - - - - - - -
Table 17. Supervisor cause (scause) register values after trap. Synchronous exception priorities are given by Table 13.
InterruptException CodeDescription

1
-1
-1
-1
-1
-1
-1
-1
-1
-1

0
-1
-2-4
-5
-6-8
-9
-10-12
-13
-14-15
-≥16

Reserved
-Supervisor software interrupt
-Reserved
-Supervisor timer interrupt
-Reserved
-Supervisor external interrupt
-Reserved
-Counter-overflow interrupt
-Reserved
-Designated for platform use

0
-0
-0
-0
-0
-0
-0
-0
-0
-0
-0
-0
-0
-0
-0
-0
-0
-0
-0
-0

0
-1
-2
-3
-4
-5
-6
-7
-8
-9
-10-11
-12
-13
-14
-15
-16-17
-18
-19
-20-23
-24-31
-32-47
-48-63
-≥64

Instruction address misaligned
-Instruction access fault
-Illegal instruction
-Breakpoint
-Load address misaligned
-Load access fault
-Store/AMO address misaligned
-Store/AMO access fault
-Environment call from U-mode
-Environment call from S-mode
-Reserved
-Instruction page fault
-Load page fault
-Reserved
-Store/AMO page fault
-Reserved
-Software check
-Hardware error
-Reserved
-Designated for custom use
-Reserved
-Designated for custom use
-Reserved

-
-
-

11.1.9. Supervisor Trap Value (stval) Register

-
-

[CV64A6_MMU] The stval register is an SXLEN-bit read-only zero register.

-
-
-
-

11.1.10. Supervisor Environment Configuration (senvcfg) Register

-
-

The senvcfg CSR is an SXLEN-bit read/write register, formatted as -shown in Figure 38, that controls certain -characteristics of the U-mode execution environment.

-
-
-
-Diagram -
-
Figure 38. Supervisor environment configuration register (senvcfg) for RV64.
-
-
-

If bit FIOM (Fence of I/O implies Memory) is set to one in senvcfg, -FENCE instructions executed in U-mode are modified so the requirement to -order accesses to device I/O implies also the requirement to order main -memory accesses. Table 18 details the modified -interpretation of FENCE instruction bits PI, PO, SI, and SO in U-mode -when FIOM=1.

-
-
-

Similarly, for U-mode when FIOM=1, if an atomic instruction that -accesses a region ordered as device I/O has its aq and/or rl bit -set, then that instruction is ordered as though it accesses both device -I/O and memory.

-
- - ---- - - - - - - - - - - - - - - - - -
Table 18. Modified interpretation of FENCE predecessor and successor sets in U-mode when FIOM=1.
Instruction bitMeaning when set

PI
-PO

Predecessor device input and memory reads (PR implied)
-Predecessor device output and memory writes (PW implied)

SI
-SO

Successor device input and memory reads (SR implied)
-Successor device output and memory writes (SW implied)

-
-

[CV64A6_MMU] CBZE, CBCFE, CBIE, PMM, LPE, ELP, SSE are always 0 because their corresponding extensions are not implemented.

-
-
-
-

11.1.11. Supervisor Address Translation and Protection (satp) Register

-
-

The satp CSR is an SXLEN-bit read/write register, formatted as -shown in Figure 39, which controls -supervisor-mode address translation and protection. This register holds -the physical page number (PPN) of the root page table, i.e., its -supervisor physical address divided by 4 KiB; an address space identifier -(ASID), which facilitates address-translation fences on a -per-address-space basis; and the MODE field, which selects the current -address-translation scheme. Further details on the access to this -register are described in Section 3.1.6.6.

-
-
-
-Diagram -
-
Figure 39. Supervisor address translation and protection (satp) register when SXLEN=64, for MODE values Bare, Sv39, Sv48, and Sv57.
-
-
-

Table 19 shows the encodings of the MODE field when -SXLEN=32 and SXLEN=64. When MODE=Bare, supervisor virtual addresses are -equal to supervisor physical addresses, and there is no additional -memory protection beyond the physical memory protection scheme described -in Section 3.7.

-
-
-

[CV64A6_MMU] When SXLEN=64, the only other valid setting for MODE is Sv39, a paged -virtual-memory scheme described in Section 11.3.

-
-
-

The number of ASID bits is UNSPECIFIED and may be zero. The number of implemented -ASID bits, termed ASIDLEN, may be determined by writing one to every -bit position in the ASID field, then reading back the value in satp to -see which bit positions in the ASID field hold a one. The -least-significant bits of ASID are implemented first: that is, if -ASIDLEN > 0, ASID[ASIDLEN-1:0] is writable. The maximal -value of ASIDLEN, termed ASIDMAX, is 9 for Sv32 or 16 for Sv39, Sv48, -and Sv57.

-
-
- - ----- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
Table 19. Encoding of satp MODE field.
SXLEN=32

Value

Name

Description

0
-1

Bare
-Sv32

No translation or protection.
-Page-based 32-bit virtual addressing (see [sv32]).

SXLEN=64

Value

Name

Description

0
-1-7
-8
-9
-10
-11
-12-13
-14-15

Bare
--
-Sv39
-Sv48
-Sv57
-Sv64
--
--

No translation or protection.
-Reserved for standard use
-Page-based 39-bit virtual addressing (see Section 11.3).
-Page-based 48-bit virtual addressing (see [sv48]).
-Page-based 57-bit virtual addressing (see [sv57]).
-Reserved for page-based 64-bit virtual addressing.
-Reserved for standard use
-Designated for custom use

-
-

The satp CSR is considered active when the effective privilege -mode is S-mode or U-mode. Executions of the address-translation -algorithm may only begin using a given value of satp when satp is -active.

-
-
-

Note that writing satp does not imply any ordering constraints between -page-table updates and subsequent address translations, nor does it -imply any invalidation of address-translation caches. If the new address -space’s page tables have been modified, or if an ASID is reused, it may -be necessary to execute an SFENCE.VMA instruction (see -Section 11.2.1) after, or in some cases before, writing -satp.

-
-
-
-
-

11.2. Supervisor Instructions

-
-

In addition to the SRET instruction defined in Section 3.3.2, one new supervisor-level instruction is provided.

-
-
-

11.2.1. Supervisor Memory-Management Fence Instruction

-
-
-Diagram -
-
-
-

The supervisor memory-management fence instruction SFENCE.VMA is used to -synchronize updates to in-memory memory-management data structures with -current execution. Instruction execution causes implicit reads and -writes to these data structures; however, these implicit references are -ordinarily not ordered with respect to explicit loads and stores. -Executing an SFENCE.VMA instruction guarantees that any previous stores -already visible to the current RISC-V hart are ordered before certain -implicit references by subsequent instructions in that hart to the -memory-management data structures. The specific set of operations -ordered by SFENCE.VMA is determined by rs1 and rs2, as described -below. SFENCE.VMA is also used to invalidate entries in the -address-translation cache associated with a hart (see [sv32algorithm]). Further details on the behavior of this instruction are described in Section 3.1.6.6 and Section 3.7.2.

-
-
-

SFENCE.VMA orders only the local hart’s implicit references to the -memory-management data structures.

-
-
-
-
-

11.3. Sv39: Page-Based 39-bit Virtual-Memory System

-
-

This section describes a simple paged virtual-memory system for -SXLEN=64, which supports 39-bit virtual address spaces. The design of -Sv39 follows the overall scheme of Sv32, and this section details only -the differences between the schemes.

-
-
-

11.3.1. Addressing and Memory Protection

-
-

Sv39 implementations support a 39-bit virtual address space, divided -into pages. An Sv39 address is partitioned as shown in -Figure 40. Instruction fetch addresses and load and -store effective addresses, which are 64 bits, must have bits 63–39 all -equal to bit 38, or else a page-fault exception will occur. The 27-bit -VPN is translated into a 44-bit PPN via a three-level page table, while -the 12-bit page offset is untranslated.

-
-
-
-Diagram -
-
Figure 40. Sv39 virtual address.
-
-
-
-Diagram -
-
Figure 41. Sv39 physical address.
-
-
-
-Diagram -
-
Figure 42. Sv39 page table entry.
-
-
-

Sv39 page tables contain 2^9 page table entries (PTEs), -eight bytes each. A page table is exactly the size of a page and must -always be aligned to a page boundary. The physical page number of the -root page table is stored in the satp register’s PPN field.

-
-
-

The PTE format for Sv39 is shown in Figure 42.

-
-
-

The V bit indicates whether the PTE is valid; if it is 0, all other bits -in the PTE are don’t-cares and may be used freely by software. The -permission bits, R, W, and X, indicate whether the page is readable, -writable, and executable, respectively. When all three are zero, the PTE -is a pointer to the next level of the page table; otherwise, it is a -leaf PTE. Writable pages must also be marked readable; the contrary -combinations are reserved for future use. Table 20 -summarizes the encoding of the permission bits.

-
- - ------ - - - - - - - - - - - - - - - - -
Table 20. Encoding of PTE R/W/X fields.
XWRMeaning

0
-0
-0
-0
-1
-1
-1
-1

0
-0
-1
-1
-0
-0
-1
-1

0
-1
-0
-1
-0
-1
-0
-1

Pointer to next level of page table.
-Read-only page.
-Reserved for future use.
-Read-write page.
-Execute-only page.
-Read-execute page.
-Reserved for future use.
-Read-write-execute page.

-
-

Attempting to fetch an instruction from a page that does not have -execute permissions raises a fetch page-fault exception. Attempting to -execute a load or load-reserved instruction whose effective address lies -within a page without read permissions raises a load page-fault -exception. Attempting to execute a store, store-conditional, or AMO -instruction whose effective address lies within a page without write -permissions raises a store page-fault exception.

-
-
-

The U bit indicates whether the page is accessible to user mode. U-mode -software may only access the page when U=1. If the SUM bit in the -sstatus register is set, supervisor mode software may also access -pages with U=1. However, supervisor code normally operates with the SUM -bit clear, in which case, supervisor code will fault on accesses to -user-mode pages.

-
-
-

The G bit designates a global mapping. Global mappings are those that -exist in all address spaces. For non-leaf PTEs, the global setting -implies that all mappings in the subsequent levels of the page table are -global.

-
-
-

The RSW field is reserved for use by supervisor software and is ignored by the implementation.

-
-
-

[CV64A6_MMU] As Svnapot is not implemented, bit 63 remains reserved and must be zeroed by software for -forward compatibility, or else a page-fault exception is raised.

-
-
-

[CV64A6_MMU] As Svpbmt is not implemented, bits 62-61 remain -reserved and must be zeroed by software for forward compatibility, or -else a page-fault exception is raised.

-
-
-

Bits 60-54 are reserved for -future standard use and, until their use is defined by some standard -extension, must be zeroed by software for forward compatibility. If any -of these bits are set, a page-fault exception is raised.

-
-
-
-
-
-
-

12. "Sstc" Extension for Supervisor-mode Timer Interrupts, Version 1.0

-
-
-

CV64A6_MMU: This extension is not supported.

-
-
-
-
-

13. "Sscofpmf" Extension for Count Overflow and Mode-Based Filtering, Version 1.0

-
-
-

CV64A6_MMU: This extension is not supported.

-
-
-
-
-

14. "H" Extension for Hypervisor Support, Version 1.0

-
-
-

CV64A6_MMU: This extension is not supported.

-
-
-
-
-

15. Control-flow Integrity (CFI)

-
-
-

CV64A6_MMU: The Zicfiss extension is not supported.

-
-
-

CV64A6_MMU: The Zicfilp extension is not supported.

-
-
-
-
-

16. "Ssdbltrp" Double Trap Extension, Version 1.0

-
- -
-
-
-

17. RISC-V Privileged Instruction Set Listings

-
-
-

This chapter presents instruction-set listings for all instructions -defined in the RISC-V Privileged Architecture.

-
-
-

The instruction-set listings for unprivileged instructions, including -the ECALL and EBREAK instructions, are provided in Volume I of this -manual.

-
-
-
-Diagram -
-
Figure 43. RISC-V Privileged Instructions
-
-
-
-
-

18. History

-
-
-

18.1. Research Funding at UC Berkeley

-
-

Development of the RISC-V architecture and implementations has been -partially funded by the following sponsors.

-
-
-
    -
  • -

    Par Lab: Research supported by Microsoft (Award #024263) and Intel -(Award #024894) funding and by matching funding by U.C. Discovery (Award -#DIG07-10227). Additional support came from Par Lab affiliates Nokia, -NVIDIA, Oracle, and Samsung.

    -
  • -
  • -

    Project Isis: DoE Award DE-SC0003624.

    -
  • -
  • -

    ASPIRE Lab: DARPA PERFECT program, Award HR0011-12-2-0016. DARPA -POEM program Award HR0011-11-C-0100. The Center for Future Architectures -Research (C-FAR), a STARnet center funded by the Semiconductor Research -Corporation. Additional support from ASPIRE industrial sponsor, Intel, -and ASPIRE affiliates, Google, Huawei, Nokia, NVIDIA, Oracle, and -Samsung.

    -
  • -
-
-
-

The content of this paper does not necessarily reflect the position or -the policy of the US government and no official endorsement should be -inferred.

-
-
-
-
-
-

Bibliography

-
-
-

Goldberg, R. P. (1974). Survey of virtual machine research. Computer, 7(6), 34–45.

-
-
-
-
- - - \ No newline at end of file diff --git a/docs/06_cv64a6_mmu/riscv/src/config.adoc b/docs/06_cv64a6_mmu/riscv/src/config.adoc deleted file mode 100644 index b9040b7532..0000000000 --- a/docs/06_cv64a6_mmu/riscv/src/config.adoc +++ /dev/null @@ -1,23 +0,0 @@ -:ohg-config: CV64A6_MMU -:XLEN: 64 -:RVA: false -:RVC: true -:RVS: true -:RVU: true -:RVH: false -:SV: SV0 -:RVZicfilp: false -:RVZicfiss: false -:RVZsmstateen: false -:RVZsmcsrind-RVZsscsrind: false -:RVZsmepmp: false -:RVZsmcntrpmf: false -:RVZsmrnmi: false -:RVZsmcdeleg: false -:RVZsstc: false -:RVZsscofpmf: false -:RVZsmmpm: false -:DCacheEn: false -:MTvalEn: false -:MTvecDirectEn: true -:note: false diff --git a/docs/06_cv64a6_mmu/riscv/unpriv-isa-cv64a6_mmu.html b/docs/06_cv64a6_mmu/riscv/unpriv-isa-cv64a6_mmu.html deleted file mode 100644 index 9187b962a0..0000000000 --- a/docs/06_cv64a6_mmu/riscv/unpriv-isa-cv64a6_mmu.html +++ /dev/null @@ -1,26917 +0,0 @@ - - - - - - - - -The RISC-V Instruction Set Manual for CV64A6_MMU: Volume I - Unprivileged Architecture - - - - - - -
-
-
-
-

This document describes the RISC-V unprivileged architecture tailored for -OpenHW Group CV64A6_MMU. -Parts of the original specification that are not relevant (e.g., unsupported -extensions) are replaced by placeholders.

-
-
-

Contributors to all versions of the spec in alphabetical order (please contact editors to suggest -corrections): Derek Atkins, -Arvind, -Krste Asanović, -Rimas Avižienis, -Jacob Bachmeyer, -Christopher F. Batten, -Allen J. Baum, -Abel Bernabeu, -Alex Bradbury, -Scott Beamer, -Hans Boehm, -Preston Briggs, -Christopher Celio, -Chuanhua Chang, -David Chisnall, -Paul Clayton, -Palmer Dabbelt, -L Peter Deutsch, -Ken Dockser, -Paul Donahue, -Aaron Durbin, -Roger Espasa, -Greg Favor, -Andy Glew, -Shaked Flur, -Stefan Freudenberger, -Marc Gauthier, -Andy Glew, -Jan Gray, -Gianluca Guida, -Michael Hamburg, -John Hauser, -John Ingalls, -David Horner, -Bruce Hoult, -Bill Huffman, -Alexandre Joannou, -Olof Johansson, -Ben Keller, -David Kruckemyer, -Tariq Kurd, -Yunsup Lee, -Paul Loewenstein, -Daniel Lustig, -Yatin Manerkar, -Luc Maranget, -Ben Marshall, -Margaret Martonosi, -Phil McCoy, -Nathan Menhorn, -Christoph Müllner, -Joseph Myers, -Vijayanand Nagarajan, -Rishiyur Nikhil, -Jonas Oberhauser, -Stefan O’Rear, -Markku-Juhani O. Saarinen, -Albert Ou, -John Ousterhout, -Daniel Page, -David Patterson, -Christopher Pulte, -Jose Renau, -Josh Scheid, -Colin Schmidt, -Peter Sewell, -Susmit Sarkar, -Ved Shanbhogue, -Brent Spinney, -Brendan Sweeney, -Michael Taylor, -Wesley Terpstra, -Matt Thomas, -Tommy Thorn, -Philipp Tomsich, -Caroline Trippel, -Ray VanDeWalker, -Muralidaran Vijayaraghavan, -Megan Wachs, -Paul Wamsley, -Andrew Waterman, -Robert Watson, -David Weaver, -Derek Williams, -Claire Wolf, -Andrew Wright, -Reinoud Zandijk, -and Sizhuo Zhang.

-
-
-

This document is released under a Creative Commons Attribution 4.0 International License.

-
-
-

This document is a derivative of “The RISC-V Instruction Set Manual, Volume I: User-Level ISA -Version 2.1” released under the following license: ©2010-2017 Andrew Waterman, Yunsup Lee, -David Patterson, Krste Asanović. Creative Commons Attribution 4.0 International License. -Please cite as: “The RISC-V Instruction Set Manual, Volume I: User-Level ISA, Document -Version 20191214-draft”, Editors Andrew Waterman and Krste Asanović, RISC-V Foundation, -December 2019.

-
-
-

Contributors to CV64A6_MMU versions of the spec in alphabetical order: -Jean-Roch Coulon, André Sintzoff.

-
-
-
-
-

Preface

-
-
-

Preface to Document Version for CV64A6_MMU

-
-
-

This document describes the RISC-V unprivileged architecture tailored for -OpenHW Group CV64A6_MMU.

-
-
-

Preface to Document Version 20240703

-
-
-

This document describes the RISC-V unprivileged architecture.

-
-
-

The ISA modules marked Ratified have been ratified at this time. The -modules marked Frozen are not expected to change significantly before -being put up for ratification. The modules marked Draft are expected -to change before ratification.

-
-
-

The document contains the following versions of the RISC-V ISA modules:

-
- ----- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
BaseVersionStatus

RV32I

2.1

Ratified

RV32E

2.0

Ratified

RV64E

2.0

Ratified

RV64I

2.1

Ratified

RV128I

1.7

Draft

Extension

Version

Status

Zifencei

2.0

Ratified

Zicsr

2.0

Ratified

Zicntr

2.0

Ratified

Zihintntl

1.0

Ratified

Zihintpause

2.0

Ratified

Zimop

1.0

Ratified

Zicond

1.0

Ratified

M

2.0

Ratified

Zmmul

1.0

Ratified

A

2.1

Ratified

Zawrs

1.01

Ratified

Zacas

1.0

Ratified

Zabha

1.0

Ratified

RVWMO

2.0

Ratified

Ztso

1.0

Ratified

CMO

1.0

Ratified

F

2.2

Ratified

D

2.2

Ratified

Q

2.2

Ratified

Zfh

1.0

Ratified

Zfhmin

1.0

Ratified

Zfa

1.0

Ratified

Zfinx

1.0

Ratified

Zdinx

1.0

Ratified

Zhinx

1.0

Ratified

Zhinxmin

1.0

Ratified

C

2.0

Ratified

Zce

1.0

Ratified

B

1.0

Ratified

P

0.2

Draft

V

1.0

Ratified

Zbkb

1.0

Ratified

Zbkc

1.0

Ratified

Zbkx

1.0

Ratified

Zk

1.0

Ratified

Zks

1.0

Ratified

Zvbb

1.0

Ratified

Zvbc

1.0

Ratified

Zvkg

1.0

Ratified

Zvkned

1.0

Ratified

Zvknhb

1.0

Ratified

Zvksed

1.0

Ratified

Zvksh

1.0

Ratified

Zvkt

1.0

Ratified

Zicfiss

1.0

Ratified

Zicfilp

1.0

Ratified

-
-

The changes in this version of the document include:

-
-
-
    -
  • -

    The inclusion of all ratified extensions through March 2024.

    -
  • -
  • -

    The draft Zam extension has been removed, in favor of the definition of a misaligned atomicity granule PMA.

    -
  • -
  • -

    The concept of vacant memory regions has been superseded by inaccessible memory or I/O regions.

    -
  • -
-
-
-

Preface to Document Version 20191213-Base-Ratified

-
-
-

This document describes the RISC-V unprivileged architecture.

-
-
-

The ISA modules marked Ratified have been ratified at this time. The -modules marked Frozen are not expected to change significantly before -being put up for ratification. The modules marked Draft are expected -to change before ratification.

-
-
-

The document contains the following versions of the RISC-V ISA modules:

-
- ----- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
BaseVersionStatus

RVWMO

2.0

Ratified

RV32I

2.1

Ratified

RV64I

2.1

Ratified

RV32E

1.9

Draft

RV128I

1.7

Draft

Extension

Version

Status

M

2.0

Ratified

A

2.1

Ratified

F

2.2

Ratified

D

2.2

Ratified

Q

2.2

Ratified

C

2.0

Ratified

Counters

2.0

Draft

L

0.0

Draft

B

0.0

Draft

J

0.0

Draft

T

0.0

Draft

P

0.2

Draft

V

0.7

Draft

Zicsr

2.0

Ratified

Zifencei

2.0

Ratified

Zam

0.1

Draft

Ztso

0.1

Frozen

-
-

The changes in this version of the document include:

-
-
-
    -
  • -

    The A extension, now version 2.1, was ratified by the board in -December 2019.

    -
  • -
  • -

    Defined big-endian ISA variant.

    -
  • -
  • -

    Moved N extension for user-mode interrupts into Volume II.

    -
  • -
  • -

    Defined PAUSE hint instruction.

    -
  • -
-
-
-

Preface to Document Version 20190608-Base-Ratified

-
-
-

This document describes the RISC-V unprivileged architecture.

-
-
-

The RVWMO memory model has been ratified at this time. The ISA modules -marked Ratified have been ratified at this time. The modules marked -Frozen are not expected to change significantly before being put up -for ratification. The modules marked Draft are expected to change -before ratification.

-
-
-

The document contains the following versions of the RISC-V ISA modules:

-
- ----- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
BaseVersionStatus

RVWMO

2.0

Ratified

RV32I

2.1

Ratified

RV64I

2.1

Ratified

RV32E

1.9

Draft

RV128I

1.7

Draft

Extension

Version

Status

Zifencei

2.0

Ratified

Zicsr

2.0

Ratified

M

2.0

Ratified

A

2.0

Frozen

F

2.2

Ratified

D

2.2

Ratified

Q

2.2

Ratified

C

2.0

Ratified

Ztso

0.1

Frozen

Counters

2.0

Draft

L

0.0

Draft

B

0.0

Draft

J

0.0

Draft

T

0.0

Draft

P

0.2

Draft

V

0.7

Draft

Zam

0.1

Draft

-
-

The changes in this version of the document include:

-
-
-
    -
  • -

    Moved description to Ratified for the ISA modules ratified by the -board in early 2019.

    -
  • -
  • -

    Removed the A extension from ratification.

    -
  • -
  • -

    Changed document version scheme to avoid confusion with versions of -the ISA modules.

    -
  • -
  • -

    Incremented the version numbers of the base integer ISA to 2.1, -reflecting the presence of the ratified RVWMO memory model and exclusion -of FENCE.I, counters, and CSR instructions that were in previous base -ISA.

    -
  • -
  • -

    Incremented the version numbers of the F and D extensions to 2.2, -reflecting that version 2.1 changed the canonical NaN, and version 2.2 -defined the NaN-boxing scheme and changed the definition of the FMIN and -FMAX instructions.

    -
  • -
  • -

    Changed name of document to refer to "unprivileged" instructions as -part of move to separate ISA specifications from platform profile -mandates.

    -
  • -
  • -

    Added clearer and more precise definitions of execution environments, -harts, traps, and memory accesses.

    -
  • -
  • -

    Defined instruction-set categories: standard, reserved, custom, -non-standard, and non-conforming.

    -
  • -
  • -

    Removed text implying operation under alternate endianness, as -alternate-endianness operation has not yet been defined for RISC-V.

    -
  • -
  • -

    Changed description of misaligned load and store behavior. The -specification now allows visible misaligned address traps in execution -environment interfaces, rather than just mandating invisible handling of -misaligned loads and stores in user mode. Also, now allows access-fault -exceptions to be reported for misaligned accesses (including atomics) -that should not be emulated.

    -
  • -
  • -

    Moved FENCE.I out of the mandatory base and into a separate extension, -with Zifencei ISA name. FENCE.I was removed from the Linux user ABI and -is problematic in implementations with large incoherent instruction and -data caches. However, it remains the only standard instruction-fetch -coherence mechanism.

    -
  • -
  • -

    Removed prohibitions on using RV32E with other extensions.

    -
  • -
  • -

    Removed platform-specific mandates that certain encodings produce -illegal-instruction exceptions in RV32E and RV64I chapters.

    -
  • -
  • -

    Counter/timer instructions are now not considered part of the -mandatory base ISA, and so CSR instructions were moved into separate -chapter and marked as version 2.0, with the unprivileged counters moved -into another separate chapter. The counters are not ready for -ratification as there are outstanding issues, including counter -inaccuracies.

    -
  • -
  • -

    A CSR-access ordering model has been added.

    -
  • -
  • -

    Explicitly defined the 16-bit half-precision floating-point format for -floating-point instructions in the 2-bit fmt field.

    -
  • -
  • -

    Defined the signed-zero behavior of FMIN.fmt and FMAX.fmt, and -changed their behavior on signaling-NaN inputs to conform to the -minimumNumber and maximumNumber operations in the proposed IEEE 754-201x -specification.

    -
  • -
  • -

    The memory consistency model, RVWMO, has been defined.

    -
  • -
  • -

    The "Zam" extension, which permits misaligned AMOs and specifies -their semantics, has been defined.

    -
  • -
  • -

    The "Ztso" extension, which enforces a stricter memory consistency -model than RVWMO, has been defined.

    -
  • -
  • -

    Improvements to the description and commentary.

    -
  • -
  • -

    Defined the term IALIGN as shorthand to describe the -instruction-address alignment constraint.

    -
  • -
  • -

    Removed text of P extension chapter as now superseded by active task -group documents.

    -
  • -
  • -

    Removed text of V extension chapter as now superseded by separate -vector extension draft document.

    -
  • -
-
-
-

Preface to Document Version 2.2

-
-
-

This is version 2.2 of the document describing the RISC-V user-level -architecture. The document contains the following versions of the RISC-V -ISA modules:

-
- ----- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
BaseVersionDraft Frozen?

RV32I

2.0

Y

RV32E

1.9

N

RV64I

2.0

Y

RV128I

1.7

N

Extension

Version

Frozen?

M

2.0

Y

A

2.0

Y

F

2.0

Y

D

2.0

Y

Q

2.0

Y

L

0.0

N

C

2.0

Y

B

0.0

N

J

0.0

N

T

0.0

N

P

0.1

N

V

0.7

N

N

1.1

N

-
-

To date, no parts of the standard have been officially ratified by the -RISC-V Foundation, but the components labeled "frozen" above are not -expected to change during the ratification process beyond resolving -ambiguities and holes in the specification.

-
-
-

The major changes in this version of the document include:

-
-
-
    -
  • -

    The previous version of this document was released under a Creative -Commons Attribution 4.0 International License by the original authors, -and this and future versions of this document will be released under the -same license.

    -
  • -
  • -

    Rearranged chapters to put all extensions first in canonical order.

    -
  • -
  • -

    Improvements to the description and commentary.

    -
  • -
  • -

    Modified implicit hinting suggestion on JALR to support more efficient -macro-op fusion of LUI/JALR and AUIPC/JALR pairs.

    -
  • -
  • -

    Clarification of constraints on load-reserved/store-conditional -sequences.

    -
  • -
  • -

    A new table of control and status register (CSR) mappings.

    -
  • -
  • -

    Clarified purpose and behavior of high-order bits of fcsr.

    -
  • -
  • -

    Corrected the description of the FNMADD.fmt and FNMSUB.fmt -instructions, which had suggested the incorrect sign of a zero result.

    -
  • -
  • -

    Instructions FMV.S.X and FMV.X.S were renamed to FMV.W.X and FMV.X.W -respectively to be more consistent with their semantics, which did not -change. The old names will continue to be supported in the tools.

    -
  • -
  • -

    Specified behavior of narrower (<FLEN) floating-point -values held in wider f registers using NaN-boxing model.

    -
  • -
  • -

    Defined the exception behavior of FMA(∞, 0, qNaN).

    -
  • -
  • -

    Added note indicating that the P extension might be reworked into an -integer packed-SIMD proposal for fixed-point operations using the -integer registers.

    -
  • -
  • -

    A draft proposal of the V vector instruction-set extension.

    -
  • -
  • -

    An early draft proposal of the N user-level traps extension.

    -
  • -
  • -

    An expanded pseudoinstruction listing.

    -
  • -
  • -

    Removal of the calling convention chapter, which has been superseded -by the RISC-V ELF psABI Specification (RISC-V ELF PsABI Specification, n.d.).

    -
  • -
  • -

    The C extension has been frozen and renumbered version 2.0.

    -
  • -
-
-
-

Preface to Document Version 2.1

-
-
-

This is version 2.1 of the document describing the RISC-V user-level -architecture. Note the frozen user-level ISA base and extensions IMAFDQ -version 2.0 have not changed from the previous version of this -document (Waterman et al., 2014), but some specification holes have been fixed and the -documentation has been improved. Some changes have been made to the -software conventions.

-
-
-
    -
  • -

    Numerous additions and improvements to the commentary sections.

    -
  • -
  • -

    Separate version numbers for each chapter.

    -
  • -
  • -

    Modification to long instruction encodings >64 bits to -avoid moving the rd specifier in very long instruction formats.

    -
  • -
  • -

    CSR instructions are now described in the base integer format where -the counter registers are introduced, as opposed to only being -introduced later in the floating-point section (and the companion -privileged architecture manual).

    -
  • -
  • -

    The SCALL and SBREAK instructions have been renamed to ECALL and -EBREAK, respectively. Their encoding and functionality are unchanged.

    -
  • -
  • -

    Clarification of floating-point NaN handling, and a new canonical NaN -value.

    -
  • -
  • -

    Clarification of values returned by floating-point to integer -conversions that overflow.

    -
  • -
  • -

    Clarification of LR/SC allowed successes and required failures, -including use of compressed instructions in the sequence.

    -
  • -
  • -

    A new RV32E base ISA proposal for reduced integer register counts, -supports MAC extensions.

    -
  • -
  • -

    A revised calling convention.

    -
  • -
  • -

    Relaxed stack alignment for soft-float calling convention, and -description of the RV32E calling convention.

    -
  • -
  • -

    A revised proposal for the C compressed extension, version 1.9 .

    -
  • -
-
-
-

Preface to Version 2.0

-
-
-

This is the second release of the user ISA specification, and we intend -the specification of the base user ISA plus general extensions (i.e., -IMAFD) to remain fixed for future development. The following changes -have been made since Version 1.0 (Waterman et al., 2011) of this ISA specification.

-
-
-
    -
  • -

    The ISA has been divided into an integer base with several standard -extensions.

    -
  • -
  • -

    The instruction formats have been rearranged to make immediate -encoding more efficient.

    -
  • -
  • -

    The base ISA has been defined to have a little-endian memory system, -with big-endian or bi-endian as non-standard variants.

    -
  • -
  • -

    Load-Reserved/Store-Conditional (LR/SC) instructions have been added -in the atomic instruction extension.

    -
  • -
  • -

    AMOs and LR/SC can support the release consistency model.

    -
  • -
  • -

    The FENCE instruction provides finer-grain memory and I/O orderings.

    -
  • -
  • -

    An AMO for fetch-and-XOR (AMOXOR) has been added, and the encoding for -AMOSWAP has been changed to make room.

    -
  • -
  • -

    The AUIPC instruction, which adds a 20-bit upper immediate to the PC, -replaces the RDNPC instruction, which only read the current PC value. -This results in significant savings for position-independent code.

    -
  • -
  • -

    The JAL instruction has now moved to the U-Type format with an -explicit destination register, and the J instruction has been dropped -being replaced by JAL with rd=x0. This removes the only instruction -with an implicit destination register and removes the J-Type instruction -format from the base ISA. There is an accompanying reduction in JAL -reach, but a significant reduction in base ISA complexity.

    -
  • -
  • -

    The static hints on the JALR instruction have been dropped. The hints -are redundant with the rd and rs1 register specifiers for code -compliant with the standard calling convention.

    -
  • -
  • -

    The JALR instruction now clears the lowest bit of the calculated -target address, to simplify hardware and to allow auxiliary information -to be stored in function pointers.

    -
  • -
  • -

    The MFTX.S and MFTX.D instructions have been renamed to FMV.X.S and -FMV.X.D, respectively. Similarly, MXTF.S and MXTF.D instructions have -been renamed to FMV.S.X and FMV.D.X, respectively.

    -
  • -
  • -

    The MFFSR and MTFSR instructions have been renamed to FRCSR and FSCSR, -respectively. FRRM, FSRM, FRFLAGS, and FSFLAGS instructions have been -added to individually access the rounding mode and exception flags -subfields of the fcsr.

    -
  • -
  • -

    The FMV.X.S and FMV.X.D instructions now source their operands from -rs1, instead of rs2. This change simplifies datapath design.

    -
  • -
  • -

    FCLASS.S and FCLASS.D floating-point classify instructions have been -added.

    -
  • -
  • -

    A simpler NaN generation and propagation scheme has been adopted.

    -
  • -
  • -

    For RV32I, the system performance counters have been extended to -64-bits wide, with separate read access to the upper and lower 32 bits.

    -
  • -
  • -

    Canonical NOP and MV encodings have been defined.

    -
  • -
  • -

    Standard instruction-length encodings have been defined for 48-bit, -64-bit, and >64-bit instructions.

    -
  • -
  • -

    Description of a 128-bit address space variant, RV128, has been added.

    -
  • -
  • -

    Major opcodes in the 32-bit base instruction format have been -allocated for user-defined custom extensions.

    -
  • -
  • -

    A typographical error that suggested that stores source their data -from rd has been corrected to refer to rs2.

    -
  • -
-
-
-
-
-

1. Introduction

-
-
-

RISC-V (pronounced "risk-five") is a new instruction-set architecture -(ISA) that was originally designed to support computer architecture -research and education, but which we now hope will also become a -standard free and open architecture for industry implementations. Our -goals in defining RISC-V include:

-
-
-
    -
  • -

    A completely open ISA that is freely available to academia and -industry.

    -
  • -
  • -

    A real ISA suitable for direct native hardware implementation, not -just simulation or binary translation.

    -
  • -
  • -

    An ISA that avoids "over-architecting" for a particular -microarchitecture style (e.g., microcoded, in-order, decoupled, -out-of-order) or implementation technology (e.g., full-custom, ASIC, -FPGA), but which allows efficient implementation in any of these.

    -
  • -
  • -

    An ISA separated into a small base integer ISA, usable by itself as -a base for customized accelerators or for educational purposes, and -optional standard extensions, to support general-purpose software -development.

    -
  • -
  • -

    Support for the revised 2008 IEEE-754 floating-point standard. (ANSI/IEEE Std 754-2008, IEEE Standard for Floating-Point Arithmetic, 2008)

    -
  • -
  • -

    An ISA supporting extensive ISA extensions and specialized variants.

    -
  • -
  • -

    Both 32-bit and 64-bit address space variants for applications, -operating system kernels, and hardware implementations.

    -
  • -
  • -

    An ISA with support for highly parallel multicore or manycore -implementations, including heterogeneous multiprocessors.

    -
  • -
  • -

    Optional variable-length instructions to both expand available -instruction encoding space and to support an optional dense instruction -encoding for improved performance, static code size, and energy -efficiency.

    -
  • -
  • -

    A fully virtualizable ISA to ease hypervisor development.

    -
  • -
  • -

    An ISA that simplifies experiments with new privileged architecture -designs.

    -
  • -
-
-
- - - - - -
- - -
-

Commentary on our design decisions is formatted as in this paragraph. -This non-normative text can be skipped if the reader is only interested -in the specification itself.

-
-
-
-
- - - - - -
- - -
-

The name RISC-V was chosen to represent the fifth major RISC ISA design -from UC Berkeley (RISC-I (Patterson & Séquin, 1981), RISC-II (Katevenis et al., 1983), SOAR (Ungar et al., 1984), and SPUR (Lee et al., 1989) were the first -four). We also pun on the use of the Roman numeral "V" to signify -"variations" and "vectors", as support for a range of architecture -research, including various data-parallel accelerators, is an explicit -goal of the ISA design.

-
-
-
-
-

-The RISC-V ISA is defined avoiding implementation details as much as -possible (although commentary is included on implementation-driven -decisions) and should be read as the software-visible interface to a -wide variety of implementations rather than as the design of a -particular hardware artifact. The RISC-V manual is structured in two -volumes. This volume covers the design of the base unprivileged -instructions, including optional unprivileged ISA extensions. -Unprivileged instructions are those that are generally usable in all -privilege modes in all privileged architectures, though behavior might -vary depending on privilege mode and privilege architecture. The second -volume provides the design of the first ("classic") privileged -architecture. The manuals use IEC 80000-13:2008 conventions, with a byte -of 8 bits.

-
-
- - - - - -
- - -
-

In the unprivileged ISA design, we tried to remove any dependence on -particular microarchitectural features, such as cache line size, or on -privileged architecture details, such as page translation. This is both -for simplicity and to allow maximum flexibility for alternative -microarchitectures or alternative privileged architectures.

-
-
-
-
-

1.1. RISC-V Hardware Platform Terminology

-
-

A RISC-V hardware platform can contain one or more RISC-V-compatible -processing cores together with other non-RISC-V-compatible cores, -fixed-function accelerators, various physical memory structures, I/O -devices, and an interconnect structure to allow the components to -communicate. -

-
-
-

A component is termed a core if it contains an independent instruction -fetch unit. A RISC-V-compatible core might support multiple -RISC-V-compatible hardware threads, or harts, through multithreading. -

-
-
-

A RISC-V core might have additional specialized instruction-set -extensions or an added coprocessor. We use the term coprocessor to -refer to a unit that is attached to a RISC-V core and is mostly -sequenced by a RISC-V instruction stream, but which contains additional -architectural state and instruction-set extensions, and possibly some -limited autonomy relative to the primary RISC-V instruction stream.

-
-
-

We use the term accelerator to refer to either a non-programmable -fixed-function unit or a core that can operate autonomously but is -specialized for certain tasks. In RISC-V systems, we expect many -programmable accelerators will be RISC-V-based cores with specialized -instruction-set extensions and/or customized coprocessors. An important -class of RISC-V accelerators are I/O accelerators, which offload I/O -processing tasks from the main application cores. -

-
-
-

The system-level organization of a RISC-V hardware platform can range -from a single-core microcontroller to a many-thousand-node cluster of -shared-memory manycore server nodes. Even small systems-on-a-chip might -be structured as a hierarchy of multicomputers and/or multiprocessors to -modularize development effort or to provide secure isolation between -subsystems. -

-
-
-
-

1.2. RISC-V Software Execution Environments and Harts

-
-

The behavior of a RISC-V program depends on the execution environment in -which it runs. A RISC-V execution environment interface (EEI) defines -the initial state of the program, the number and type of harts in the -environment including the privilege modes supported by the harts, the -accessibility and attributes of memory and I/O regions, the behavior of -all legal instructions executed on each hart (i.e., the ISA is one -component of the EEI), and the handling of any interrupts or exceptions -raised during execution including environment calls. Examples of EEIs -include the Linux application binary interface (ABI), or the RISC-V -supervisor binary interface (SBI). The implementation of a RISC-V -execution environment can be pure hardware, pure software, or a -combination of hardware and software. For example, opcode traps and -software emulation can be used to implement functionality not provided -in hardware. Examples of execution environment implementations include:

-
-
-
    -
  • -

    "Bare metal" hardware platforms where harts are directly implemented -by physical processor threads and instructions have full access to the -physical address space. The hardware platform defines an execution -environment that begins at power-on reset.

    -
  • -
  • -

    RISC-V operating systems that provide multiple user-level execution -environments by multiplexing user-level harts onto available physical -processor threads and by controlling access to memory via virtual -memory.

    -
  • -
  • -

    RISC-V hypervisors that provide multiple supervisor-level execution -environments for guest operating systems.

    -
  • -
  • -

    RISC-V emulators, such as Spike, QEMU or rv8, which emulate RISC-V -harts on an underlying x86 system, and which can provide either a -user-level or a supervisor-level execution environment.

    -
  • -
-
-
- - - - - -
- - -
-

A bare hardware platform can be considered to define an EEI, where the -accessible harts, memory, and other devices populate the environment, -and the initial state is that at power-on reset. Generally, most -software is designed to use a more abstract interface to the hardware, -as more abstract EEIs provide greater portability across different -hardware platforms. Often EEIs are layered on top of one another, where -one higher-level EEI uses another lower-level EEI.

-
-
-
-
-

-From the perspective of software running in a given execution -environment, a hart is a resource that autonomously fetches and executes -RISC-V instructions within that execution environment. In this respect, -a hart behaves like a hardware thread resource even if time-multiplexed -onto real hardware by the execution environment. Some EEIs support the -creation and destruction of additional harts, for example, via -environment calls to fork new harts.

-
-
-

The execution environment is responsible for ensuring the eventual -forward progress of each of its harts. For a given hart, that -responsibility is suspended while the hart is exercising a mechanism -that explicitly waits for an event, such as the wait-for-interrupt -instruction defined in Volume II of this specification; and that -responsibility ends if the hart is terminated. The following events -constitute forward progress:

-
-
-
    -
  • -

    The retirement of an instruction.

    -
  • -
  • -

    A trap, as defined in Section 1.6.

    -
  • -
  • -

    Any other event defined by an extension to constitute forward -progress.

    -
  • -
-
-
- - - - - -
- - -
-

The term hart was introduced in the work on Lithe (Pan et al., 2009; Pan et al., 2010) to provide a term to -represent an abstract execution resource as opposed to a software thread -programming abstraction.

-
-
-

The important distinction between a hardware thread (hart) and a -software thread context is that the software running inside an execution -environment is not responsible for causing progress of each of its -harts; that is the responsibility of the outer execution environment. So -the environment’s harts operate like hardware threads from the -perspective of the software inside the execution environment.

-
-
-

An execution environment implementation might time-multiplex a set of -guest harts onto fewer host harts provided by its own execution -environment but must do so in a way that guest harts operate like -independent hardware threads. In particular, if there are more guest -harts than host harts then the execution environment must be able to -preempt the guest harts and must not wait indefinitely for guest -software on a guest hart to "yield" control of the guest hart.

-
-
-
-
-
-

1.3. RISC-V ISA Overview

-
-

A RISC-V ISA is defined as a base integer ISA, which must be present in -any implementation, plus optional extensions to the base ISA. The base -integer ISAs are very similar to that of the early RISC processors -except with no branch delay slots and with support for optional -variable-length instruction encodings. A base is carefully restricted to -a minimal set of instructions sufficient to provide a reasonable target -for compilers, assemblers, linkers, and operating systems (with -additional privileged operations), and so provides a convenient ISA and -software toolchain "skeleton" around which more customized processor -ISAs can be built.

-
-
-

Although it is convenient to speak of the RISC-V ISA, RISC-V is -actually a family of related ISAs, of which there are currently four -base ISAs. Each base integer instruction set is characterized by the -width of the integer registers and the corresponding size of the address -space and by the number of integer registers. There are two primary base -integer variants, RV32I and RV64I, described in -Chapter 2 and Chapter 4, which provide 32-bit -or 64-bit address spaces respectively. We use the term XLEN to refer to -the width of an integer register in bits (either 32 or 64). -Chapter 6 describes the RV32E and RV64E subset variants of the -RV32I or RV64I base instruction sets respectively, which have been added to support small -microcontrollers, and which have half the number of integer registers. -Chapter 8 sketches a future RV128I variant of the -base integer instruction set supporting a flat 128-bit address space -(XLEN=128). The base integer instruction sets use a two’s-complement -representation for signed integer values.

-
-
- - - - - -
- - -
-

Although 64-bit address spaces are a requirement for larger systems, we -believe 32-bit address spaces will remain adequate for many embedded and -client devices for decades to come and will be desirable to lower memory -traffic and energy consumption. In addition, 32-bit address spaces are -sufficient for educational purposes. A larger flat 128-bit address space -might eventually be required, so we ensured this could be accommodated -within the RISC-V ISA framework.

-
-
-
-
- - - - - -
- - -
-

The four base ISAs in RISC-V are treated as distinct base ISAs. A common -question is why is there not a single ISA, and in particular, why is -RV32I not a strict subset of RV64I? Some earlier ISA designs (SPARC, -MIPS) adopted a strict superset policy when increasing address space -size to support running existing 32-bit binaries on new 64-bit hardware.

-
-
-

The main advantage of explicitly separating base ISAs is that each base -ISA can be optimized for its needs without requiring to support all the -operations needed for other base ISAs. For example, RV64I can omit -instructions and CSRs that are only needed to cope with the narrower -registers in RV32I. The RV32I variants can use encoding space otherwise -reserved for instructions only required by wider address-space variants.

-
-
-

The main disadvantage of not treating the design as a single ISA is that -it complicates the hardware needed to emulate one base ISA on another -(e.g., RV32I on RV64I). However, differences in addressing and -illegal-instruction traps generally mean some mode switch would be required in -hardware in any case even with full superset instruction encodings, and -the different RISC-V base ISAs are similar enough that supporting -multiple versions is relatively low cost. Although some have proposed -that the strict superset design would allow legacy 32-bit libraries to -be linked with 64-bit code, this is impractical in practice, even with -compatible encodings, due to the differences in software calling -conventions and system-call interfaces.

-
-
-

The RISC-V privileged architecture provides fields in misa to control -the unprivileged ISA at each level to support emulating different base -ISAs on the same hardware. We note that newer SPARC and MIPS ISA -revisions have deprecated support for running 32-bit code unchanged on -64-bit systems.

-
-
-

A related question is why there is a different encoding for 32-bit adds -in RV32I (ADD) and RV64I (ADDW)? The ADDW opcode could be used for -32-bit adds in RV32I and ADDD for 64-bit adds in RV64I, instead of the -existing design which uses the same opcode ADD for 32-bit adds in RV32I -and 64-bit adds in RV64I with a different opcode ADDW for 32-bit adds in -RV64I. This would also be more consistent with the use of the same LW -opcode for 32-bit load in both RV32I and RV64I. The very first versions -of RISC-V ISA did have a variant of this alternate design, but the -RISC-V design was changed to the current choice in January 2011. Our -focus was on supporting 32-bit integers in the 64-bit ISA not on -providing compatibility with the 32-bit ISA, and the motivation was to -remove the asymmetry that arose from having not all opcodes in RV32I -have a *W suffix (e.g., ADDW, but AND not ANDW). In hindsight, this was -perhaps not well-justified and a consequence of designing both ISAs at -the same time as opposed to adding one later to sit on top of another, -and also from a belief we had to fold platform requirements into the ISA -spec which would imply that all the RV32I instructions would have been -required in RV64I. It is too late to change the encoding now, but this -is also of little practical consequence for the reasons stated above.

-
-
-

It has been noted we could enable the *W variants as an extension to -RV32I systems to provide a common encoding across RV64I and a future -RV32 variant.

-
-
-
-
-

RISC-V has been designed to support extensive customization and -specialization. Each base integer ISA can be extended with one or more -optional instruction-set extensions. An extension may be categorized as -either standard, custom, or non-conforming. For this purpose, we divide -each RISC-V instruction-set encoding space (and related encoding spaces -such as the CSRs) into three disjoint categories: standard, -reserved, and custom. Standard extensions and encodings are defined -by RISC-V International; any extensions not defined by RISC-V International are -non-standard. Each base ISA and its standard extensions use only -standard encodings, and shall not conflict with each other in their uses -of these encodings. Reserved encodings are currently not defined but are -saved for future standard extensions; once thus used, they become -standard encodings. Custom encodings shall never be used for standard -extensions and are made available for vendor-specific non-standard -extensions. Non-standard extensions are either custom extensions, that -use only custom encodings, or non-conforming extensions, that use any -standard or reserved encoding. Instruction-set extensions are generally -shared but may provide slightly different functionality depending on the -base ISA. Chapter 38 describes various ways -of extending the RISC-V ISA. We have also developed a naming convention -for RISC-V base instructions and instruction-set extensions, described -in detail in Chapter 39.

-
-
-

To support more general software development, a set of standard -extensions are defined to provide integer multiply/divide, atomic -operations, and single and double-precision floating-point arithmetic. -The base integer ISA is named "I" (prefixed by RV32 or RV64 depending -on integer register width), and contains integer computational -instructions, integer loads, integer stores, and control-flow -instructions. The standard integer multiplication and division extension -is named "M", and adds instructions to multiply and divide values held -in the integer registers. The standard atomic instruction extension, -denoted by "A", adds instructions that atomically read, modify, and -write memory for inter-processor synchronization. The standard -single-precision floating-point extension, denoted by "F", adds -floating-point registers, single-precision computational instructions, -and single-precision loads and stores. The standard double-precision -floating-point extension, denoted by "D", expands the floating-point -registers, and adds double-precision computational instructions, loads, -and stores. The standard "C" compressed instruction extension provides -narrower 16-bit forms of common instructions.

-
-
-

Beyond the base integer ISA and these standard extensions, we believe -it is rare that a new instruction will provide a significant benefit for -all applications, although it may be very beneficial for a certain -domain. As energy efficiency concerns are forcing greater -specialization, we believe it is important to simplify the required -portion of an ISA specification. Whereas other architectures usually -treat their ISA as a single entity, which changes to a new version as -instructions are added over time, RISC-V will endeavor to keep the base -and each standard extension constant over time, and instead layer new -instructions as further optional extensions. For example, the base -integer ISAs will continue as fully supported standalone ISAs, -regardless of any subsequent extensions.

-
-
-
-

1.4. Memory

-
-

A RISC-V hart has a single byte-addressable address space of -$2^{\text{XLEN}}$ bytes for all memory accesses. A word of -memory is defined as 32 bits (4 bytes). Correspondingly, a halfword is 16 bits (2 bytes), a -doubleword is 64 bits (8 bytes), and a quadword is 128 bits (16 bytes). The memory address space is -circular, so that the byte at address $2^{\text{XLEN}}-1$ is -adjacent to the byte at address zero. Accordingly, memory address -computations done by the hardware ignore overflow and instead wrap -around modulo $2^{\text{XLEN}}$.

-
-
-

The execution environment determines the mapping of hardware resources -into a hart’s address space. Different address ranges of a hart’s -address space may (1) contain main memory, or -(2) contain one or more I/O devices. Reads and writes of I/O devices -may have visible side effects, but accesses to main memory cannot. -Vacant address ranges are not a separate category but can be represented as -either main memory or I/O regions that are not accessible. -Although it is possible for the execution environment to call everything -in a hart’s address space an I/O device, it is usually expected that -some portion will be specified as main memory.

-
-
-

When a RISC-V platform has multiple harts, the address spaces of any two -harts may be entirely the same, or entirely different, or may be partly -different but sharing some subset of resources, mapped into the same or -different address ranges.

-
-
- - - - - -
- - -
-

For a purely "bare metal" environment, all harts may see an identical -address space, accessed entirely by physical addresses. However, when -the execution environment includes an operating system employing address -translation, it is common for each hart to be given a virtual address -space that is largely or entirely its own.

-
-
-
-
-

-
-
-

Executing each RISC-V machine instruction entails one or more memory -accesses, subdivided into implicit and explicit accesses. For each -instruction executed, an implicit memory read (instruction fetch) is -done to obtain the encoded instruction to execute. Many RISC-V -instructions perform no further memory accesses beyond instruction -fetch. Specific load and store instructions perform an explicit read -or write of memory at an address determined by the instruction. The -execution environment may dictate that instruction execution performs -other implicit memory accesses (such as to implement address -translation) beyond those documented for the unprivileged ISA.

-
-
-

The execution environment determines what portions of the -address space are accessible for each kind of memory access. For -example, the set of locations that can be implicitly read for -instruction fetch may or may not have any overlap with the set of -locations that can be explicitly read by a load instruction; and the set -of locations that can be explicitly written by a store instruction may -be only a subset of locations that can be read. Ordinarily, if an -instruction attempts to access memory at an inaccessible address, an -exception is raised for the instruction.

-
-
-

Except when specified otherwise, implicit reads that do not raise an -exception and that have no side effects may occur arbitrarily early and -speculatively, even before the machine could possibly prove that the -read will be needed. For instance, a valid implementation could attempt -to read all of main memory at the earliest opportunity, cache as many -fetchable (executable) bytes as possible for later instruction fetches, -and avoid reading main memory for instruction fetches ever again. To -ensure that certain implicit reads are ordered only after writes to the -same memory locations, software must execute specific fence or -cache-control instructions defined for this purpose (such as the FENCE.I -instruction defined in Chapter 6). -

-
-
-

The memory accesses (implicit or explicit) made by a hart may appear to -occur in a different order as perceived by another hart or by any other -agent that can access the same memory. This perceived reordering of -memory accesses is always constrained, however, by the applicable memory -consistency model. The default memory consistency model for RISC-V is -the RISC-V Weak Memory Ordering (RVWMO), defined in -Chapter 18 and in appendices. Optionally, -an implementation may adopt the stronger model of Total Store Ordering, -as defined in Chapter 19. The execution environment -may also add constraints that further limit the perceived reordering of -memory accesses. Since the RVWMO model is the weakest model allowed for -any RISC-V implementation, software written for this model is compatible -with the actual memory consistency rules of all RISC-V implementations. -As with implicit reads, software must execute fence or cache-control -instructions to ensure specific ordering of memory accesses beyond the -requirements of the assumed memory consistency model and execution -environment.

-
-
-
-

1.5. Base Instruction-Length Encoding

-
-

The base RISC-V ISA has fixed-length 32-bit instructions that must be -naturally aligned on 32-bit boundaries. However, the standard RISC-V -encoding scheme is designed to support ISA extensions with -variable-length instructions, where each instruction can be any number -of 16-bit instruction parcels in length and parcels are naturally -aligned on 16-bit boundaries. The standard compressed ISA extension -described in Chapter 28 reduces code size by -providing compressed 16-bit instructions and relaxes the alignment -constraints to allow all instructions (16 bit and 32 bit) to be aligned -on any 16-bit boundary to improve code density.

-
-
-

We use the term IALIGN (measured in bits) to refer to the -instruction-address alignment constraint the implementation enforces. -IALIGN is 32 bits in the base ISA, but some ISA extensions, including -the compressed ISA extension, relax IALIGN to 16 bits. IALIGN may not -take on any value other than 16 or 32. -

-
-
-

We use the term ILEN (measured in bits) to refer to the maximum -instruction length supported by an implementation, and which is always a -multiple of IALIGN. For implementations supporting only a base -instruction set, ILEN is 32 bits. Implementations supporting longer -instructions have larger values of ILEN.

-
-
-

Table 1 illustrates the standard -RISC-V instruction-length encoding convention. All the 32-bit -instructions in the base ISA have their lowest two bits set to 11. The -optional compressed 16-bit instruction-set extensions have their lowest -two bits equal to 00, 01, or 10.

-
-
-

1.5.1. Expanded Instruction-Length Encoding

-
-

A portion of the 32-bit instruction-encoding space has been tentatively -allocated for instructions longer than 32 bits. The entirety of this -space is reserved at this time, and the following proposal for encoding -instructions longer than 32 bits is not considered frozen. -

-
-
-

Standard instruction-set extensions encoded with more than 32 bits have -additional low-order bits set to 1, with the conventions for 48-bit -and 64-bit lengths shown in -Table 1. Instruction lengths -between 80 bits and 176 bits are encoded using a 3-bit field in bits -[14:12] giving the number of 16-bit words in addition to the first -5×16-bit words. The encoding with bits [14:12] set to -"111" is reserved for future longer instruction encodings.

-
- - ------- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
Table 1. RISC-V instruction length encoding. Only the 16-bit and 32-bit encodings are considered frozen at this time.
xxxxxxxxxxxxxxaa 16-bit (aa≠11)

xxxxxxxxxxxxxxxx

xxxxxxxxxxxbbb11

32-bit (bbb≠111)

···xxxx

xxxxxxxxxxxxxxxx

xxxxxxxxxx011111

48-bit

···xxxx

xxxxxxxxxxxxxxxx

xxxxxxxxx0111111

64-bit

···xxxx

xxxxxxxxxxxxxxxx

xnnnxxxxx1111111

(80+16*nnn)-bit, nnn≠111

···xxxx

xxxxxxxxxxxxxxxx

x111xxxxx1111111

Reserved for ≥192-bits

Byte Address:

base+4

base+2

base

-
- - - - - -
- - -
-

Given the code size and energy savings of a compressed format, we wanted -to build in support for a compressed format to the ISA encoding scheme -rather than adding this as an afterthought, but to allow simpler -implementations we didn’t want to make the compressed format mandatory. -We also wanted to optionally allow longer instructions to support -experimentation and larger instruction-set extensions. Although our -encoding convention required a tighter encoding of the core RISC-V ISA, -this has several beneficial effects. -

-
-
-

An implementation of the standard IMAFD ISA need only hold the -most-significant 30 bits in instruction caches (a 6.25% saving). On -instruction cache refills, any instructions encountered with either low -bit clear should be recoded into illegal 30-bit instructions before -storing in the cache to preserve illegal-instruction exception behavior.

-
-
-

Perhaps more importantly, by condensing our base ISA into a subset of -the 32-bit instruction word, we leave more space available for -non-standard and custom extensions. In particular, the base RV32I ISA -uses less than 1/8 of the encoding space in the 32-bit instruction word. -As described in Chapter 38, an implementation that does not require support -for the standard compressed instruction extension can map 3 additional non-conforming -30-bit instruction spaces into the 32-bit fixed-width format, while preserving -support for standard ≥32-bit instruction-set -extensions. Further, if the implementation also does not need -instructions >32-bits in length, it can recover a further -four major opcodes for non-conforming extensions.

-
-
-
-
-

Encodings with bits [15:0] all zeros are defined as illegal -instructions. These instructions are considered to be of minimal length: -16 bits if any 16-bit instruction-set extension is present, otherwise 32 -bits. The encoding with bits [ILEN-1:0] all ones is also illegal; this -instruction is considered to be ILEN bits long.

-
-
- - - - - -
- - -
-

We consider it a feature that any length of instruction containing all -zero bits is not legal, as this quickly traps erroneous jumps into -zeroed memory regions. Similarly, we also reserve the instruction -encoding containing all ones to be an illegal instruction, to catch the -other common pattern observed with unprogrammed non-volatile memory -devices, disconnected memory buses, or broken memory devices.

-
-
-

Software can rely on a naturally aligned 32-bit word containing zero to -act as an illegal instruction on all RISC-V implementations, to be used -by software where an illegal instruction is explicitly desired. Defining -a corresponding known illegal value for all ones is more difficult due -to the variable-length encoding. Software cannot generally use the -illegal value of ILEN bits of all 1s, as software might not know ILEN -for the eventual target machine (e.g., if software is compiled into a -standard binary library used by many different machines). Defining a -32-bit word of all ones as illegal was also considered, as all machines -must support a 32-bit instruction size, but this requires the -instruction-fetch unit on machines with ILEN >32 report an -illegal-instruction exception rather than an access-fault exception when -such an instruction borders a protection boundary, complicating -variable-instruction-length fetch and decode.

-
-
-
-
-

-RISC-V base ISAs have either little-endian or big-endian memory systems, -with the privileged architecture further defining bi-endian operation. -Instructions are stored in memory as a sequence of 16-bit little-endian -parcels, regardless of memory system endianness. Parcels forming one -instruction are stored at increasing halfword addresses, with the -lowest-addressed parcel holding the lowest-numbered bits in the -instruction specification. - -

-
-
- - - - - -
- - -
-

We originally chose little-endian byte ordering for the RISC-V memory -system because little-endian systems are currently dominant commercially -(all x86 systems; iOS, Android, and Windows for ARM). A minor point is -that we have also found little-endian memory systems to be more natural -for hardware designers. However, certain application areas, such as IP -networking, operate on big-endian data structures, and certain legacy -code bases have been built assuming big-endian processors, so we have -defined big-endian and bi-endian variants of RISC-V.

-
-
-

We have to fix the order in which instruction parcels are stored in -memory, independent of memory system endianness, to ensure that the -length-encoding bits always appear first in halfword address order. This -allows the length of a variable-length instruction to be quickly -determined by an instruction-fetch unit by examining only the first few -bits of the first 16-bit instruction parcel.

-
-
-

We further make the instruction parcels themselves little-endian to -decouple the instruction encoding from the memory system endianness -altogether. This design benefits both software tooling and bi-endian -hardware. Otherwise, for instance, a RISC-V assembler or disassembler -would always need to know the intended active endianness, despite that -in bi-endian systems, the endianness mode might change dynamically -during execution. In contrast, by giving instructions a fixed -endianness, it is sometimes possible for carefully written software to -be endianness-agnostic even in binary form, much like -position-independent code.

-
-
-

The choice to have instructions be only little-endian does have -consequences, however, for RISC-V software that encodes or decodes -machine instructions. Big-endian JIT compilers, for example, must swap -the byte order when storing to instruction memory.

-
-
-

Once we had decided to fix on a little-endian instruction encoding, this -naturally led to placing the length-encoding bits in the LSB positions -of the instruction format to avoid breaking up opcode fields.

-
-
-
-
-
-
-

1.6. Exceptions, Traps, and Interrupts

-
-

We use the term exception to refer to an unusual condition occurring -at run time associated with an instruction in the current RISC-V hart. -We use the term interrupt to refer to an external asynchronous event -that may cause a RISC-V hart to experience an unexpected transfer of -control. We use the term trap to refer to the transfer of control to a -trap handler caused by either an exception or an interrupt. - - -

-
-
-

The instruction descriptions in following chapters describe conditions -that can raise an exception during execution. The general behavior of -most RISC-V EEIs is that a trap to some handler occurs when an exception -is signaled on an instruction (except for floating-point exceptions, -which, in the standard floating-point extensions, do not cause traps). -The manner in which interrupts are generated, routed to, and enabled by -a hart depends on the EEI.

-
-
- - - - - -
- - -
-

Our use of "exception" and "trap" is compatible with that in the -IEEE-754 floating-point standard.

-
-
-
-
-

How traps are handled and made visible to software running on the hart -depends on the enclosing execution environment. From the perspective of -software running inside an execution environment, traps encountered by a -hart at runtime can have four different effects:

-
-
-
-
Contained Trap
-
-

The trap is visible to, and handled by, software running inside the -execution environment. For example, in an EEI providing both -supervisor and user mode on harts, an ECALL by a user-mode hart will -generally result in a transfer of control to a supervisor-mode handler -running on the same hart. Similarly, in the same environment, when a -hart is interrupted, an interrupt handler will be run in supervisor -mode on the hart.

-
-
Requested Trap
-
-

The trap is a synchronous exception that is an explicit call to the -execution environment requesting an action on behalf of software -inside the execution environment. An example is a system call. In this -case, execution may or may not resume on the hart after the requested -action is taken by the execution environment. For example, a system -call could remove the hart or cause an orderly termination of the -entire execution environment.

-
-
Invisible Trap
-
-

The trap is handled transparently by the execution environment and -execution resumes normally after the trap is handled. Examples include -emulating missing instructions, handling non-resident page faults in a -demand-paged virtual-memory system, or handling device interrupts for -a different job in a multiprogrammed machine. In these cases, the -software running inside the execution environment is not aware of the -trap (we ignore timing effects in these definitions).

-
-
Fatal Trap
-
-

The trap represents a fatal failure and causes the execution -environment to terminate execution. Examples include failing a -virtual-memory page-protection check or allowing a watchdog timer to -expire. Each EEI should define how execution is terminated and -reported to an external environment.

-
-
-
-
-

Table 2 shows the characteristics of each kind of trap.

-
- - ------- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
Table 2. Characteristics of traps
Contained | Requested | Invisible | Fatal

Execution terminates

No

No¹

No

Yes

Software is oblivious

No

No

Yes

Yes²

Handled by environment

No

Yes

Yes

Yes

-
-

1 Termination may be requested
-2 Imprecise fatal traps might be observable by software

-
-
-

The EEI defines for each trap whether it is handled precisely, though -the recommendation is to maintain preciseness where possible. Contained -and requested traps can be observed to be imprecise by software inside -the execution environment. Invisible traps, by definition, cannot be -observed to be precise or imprecise by software running inside the -execution environment. Fatal traps can be observed to be imprecise by -software running inside the execution environment, if known-errorful -instructions do not cause immediate termination.

-
-
-

Because this document describes unprivileged instructions, traps are -rarely mentioned. Architectural means to handle contained traps are -defined in the privileged architecture manual, along with other features -to support richer EEIs. Unprivileged instructions that are defined -solely to cause requested traps are documented here. Invisible traps -are, by their nature, out of scope for this document. Instruction -encodings that are not defined here and not defined by some other means -may cause a fatal trap.

-
-
-
-

1.7. UNSPECIFIED Behaviors and Values

-
-

The architecture fully describes what implementations must do and any -constraints on what they may do. In cases where the architecture -intentionally does not constrain implementations, the term UNSPECIFIED is -explicitly used. - -

-
-
-

The term UNSPECIFIED refers to a behavior or value that is intentionally -unconstrained. The definition of these behaviors or values is open to -extensions, platform standards, or implementations. Extensions, platform -standards, or implementation documentation may provide normative content -to further constrain cases that the base architecture defines as UNSPECIFIED.

-
-
-

Like the base architecture, extensions should fully describe allowable -behavior and values and use the term UNSPECIFIED for cases that are intentionally -unconstrained. These cases may be constrained or defined by other -extensions, platform standards, or implementations.

-
-
-
-
-
-

2. RV32I Base Integer Instruction Set, Version 2.1

-
-
-

This chapter describes the RV32I base integer instruction set.

-
-
- - - - - -
- - -
-

RV32I was designed to be sufficient to form a compiler target and to -support modern operating system environments. The ISA was also designed -to reduce the hardware required in a minimal implementation. RV32I -contains 40 unique instructions, though a simple implementation might -cover the ECALL/EBREAK instructions with a single SYSTEM hardware -instruction that always traps and might be able to implement the FENCE -instruction as a NOP, reducing base instruction count to 38 total. RV32I -can emulate almost any other ISA extension (except the A extension, -which requires additional hardware support for atomicity).

-
-
-

In practice, a hardware implementation including the machine-mode -privileged architecture will also require the 6 CSR instructions.

-
-
-

Subsets of the base integer ISA might be useful for pedagogical -purposes, but the base has been defined such that there should be little -incentive to subset a real hardware implementation beyond omitting -support for misaligned memory accesses and treating all SYSTEM -instructions as a single trap.

-
-
-
-
- - - - - -
- - -
-

The standard RISC-V assembly language syntax is documented in the -Assembly Programmer’s Manual (RISC-V Assembly Programmer’s Manual, n.d.).

-
-
-
-
- - - - - -
- - -
-

Most of the commentary for RV32I also applies to the RV64I base.

-
-
-
-
-

2.1. Programmers' Model for Base Integer ISA

-
-

Table 3 shows the unprivileged state for the base -integer ISA. For RV32I, the 32 x registers are each 32 bits wide, -i.e., XLEN=32. Register x0 is hardwired with all bits equal to 0. -General purpose registers x1-x31 hold values that various -instructions interpret as a collection of Boolean values, or as two’s -complement signed binary integers or unsigned binary integers.

-
-
-

There is one additional unprivileged register: the program counter pc -holds the address of the current instruction.

-
- - ----- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
Table 3. RISC-V base unprivileged integer register state.
XLEN-1 | 0

x0/zero

x1

x2

x3

x4

x5

x6

x7

x8

x9

x10

x11

x12

x13

x14

x15

x16

x17

x18

x19

x20

x21

x22

x23

x24

x25

x26

x27

x28

x29

x30

x31

XLEN

XLEN-1

0

pc

XLEN

-
- - - - - -
- - -
-

There is no dedicated stack pointer or subroutine return address link -register in the Base Integer ISA; the instruction encoding allows any -x register to be used for these purposes. However, the standard -software calling convention uses register x1 to hold the return -address for a call, with register x5 available as an alternate link -register. The standard calling convention uses register x2 as the -stack pointer.

-
-
-

Hardware might choose to accelerate function calls and returns that use -x1 or x5. See the descriptions of the JAL and JALR instructions.

-
-
-

The optional compressed 16-bit instruction format is designed around the -assumption that x1 is the return address register and x2 is the -stack pointer. Software using other conventions will operate correctly -but may have greater code size.

-
-
-

The number of available architectural registers can have large impacts -on code size, performance, and energy consumption. Although 16 registers -would arguably be sufficient for an integer ISA running compiled code, -it is impossible to encode a complete ISA with 16 registers in 16-bit -instructions using a 3-address format. Although a 2-address format would -be possible, it would increase instruction count and lower efficiency. -We wanted to avoid intermediate instruction sizes (such as Xtensa’s -24-bit instructions) to simplify base hardware implementations, and once -a 32-bit instruction size was adopted, it was straightforward to support -32 integer registers. A larger number of integer registers also helps -performance on high-performance code, where there can be extensive use -of loop unrolling, software pipelining, and cache tiling.

-
-
-

For these reasons, we chose a conventional size of 32 integer registers -for RV32I. Dynamic register usage tends to be dominated by a few -frequently accessed registers, and regfile implementations can be -optimized to reduce access energy for the frequently accessed -registers (Tseng & Asanović, 2000). The optional compressed 16-bit instruction format mostly -only accesses 8 registers and hence can provide a dense instruction -encoding, while additional instruction-set extensions could support a -much larger register space (either flat or hierarchical) if desired.

-
-
-

For resource-constrained embedded applications, we have defined the -RV32E subset, which only has 16 registers -(Chapter 3).

-
-
-
-
-
-

2.2. Base Instruction Formats

-
-

In the base RV32I ISA, there are four core instruction formats -(R/I/S/U), as shown in Base instruction formats. All are a fixed 32 -bits in length. The base ISA has IALIGN=32, meaning that instructions must be aligned on a four-byte boundary in memory. An -instruction-address-misaligned exception is generated on a taken branch -or unconditional jump if the target address is not IALIGN-bit aligned. -This exception is reported on the branch or jump instruction, not on the -target instruction. No instruction-address-misaligned exception is -generated for a conditional branch that is not taken.

-
-
- - - - - -
- - -
-

The alignment constraint for base ISA instructions is relaxed to a -two-byte boundary when instruction extensions with 16-bit lengths or -other odd multiples of 16-bit lengths are added (i.e., IALIGN=16).

-
-
-

Instruction-address-misaligned exceptions are reported on the branch or -jump that would cause instruction misalignment to help debugging, and to -simplify hardware design for systems with IALIGN=32, where these are the -only places where misalignment can occur.

-
-
-
-
-

The behavior upon decoding a reserved instruction is UNSPECIFIED.

-
-
- - - - - -
- - -
-

Some platforms may require that opcodes reserved for standard use raise -an illegal-instruction exception. Other platforms may permit reserved -opcode space be used for non-conforming extensions.

-
-
-
-
-

The RISC-V ISA keeps the source (rs1 and rs2) and destination (rd) -registers at the same position in all formats to simplify decoding. -Except for the 5-bit immediates used in CSR instructions -(Chapter 7), immediates are always -sign-extended, and are generally packed towards the leftmost available -bits in the instruction and have been allocated to reduce hardware -complexity. In particular, the sign bit for all immediates is always in -bit 31 of the instruction to speed sign-extension circuitry.

-
-
-
-Diagram -
-
-
-
-Diagram -
-
-
-
-Diagram -
-
-
-
-Diagram -
-
-
-

RISC-V base instruction formats. Each immediate subfield is labeled with the bit position (imm[x]) in the immediate value being produced, rather than the bit position within the instruction’s immediate field as is usually done.

-
-
- - - - - -
- - -
-

Decoding register specifiers is usually on the critical paths in -implementations, and so the instruction format was chosen to keep all -register specifiers at the same position in all formats at the expense -of having to move immediate bits across formats (a property shared with -RISC-IV, a.k.a. SPUR (Lee et al., 1989)).

-
-
-

In practice, most immediates are either small or require all XLEN bits. -We chose an asymmetric immediate split (12 bits in regular instructions -plus a special load-upper-immediate instruction with 20 bits) to -increase the opcode space available for regular instructions.

-
-
-

Immediates are sign-extended because we did not observe a benefit to -using zero extension for some immediates as in the MIPS ISA and wanted -to keep the ISA as simple as possible.

-
-
-
-
-
-

2.3. Immediate Encoding Variants

-
-

There are a further two variants of the instruction formats (B/J) based -on the handling of immediates, as shown in Base instruction formats immediate variants.

-
-
-
-Diagram -
-
-
-
-Diagram -
-
-
-
-Diagram -
-
-
-
-Diagram -
-
-
-
-Diagram -
-
-
-
-Diagram -
-
-
-

The only difference between the S and B formats is that the 12-bit -immediate field is used to encode branch offsets in multiples of 2 in -the B format. Instead of shifting all bits in the instruction-encoded -immediate left by one in hardware as is conventionally done, the middle -bits (imm[10:1]) and sign bit stay in fixed positions, while the lowest -bit in S format (inst[7]) encodes a high-order bit in B format.

-
-
-

Similarly, the only difference between the U and J formats is that the -20-bit immediate is shifted left by 12 bits to form U immediates and by -1 bit to form J immediates. The location of instruction bits in the U -and J format immediates is chosen to maximize overlap with the other -formats and with each other.

-
-
-

Immediate types shows the immediates produced by -each of the base instruction formats, and is labeled to show which -instruction bit (inst[y]) produces each bit of the immediate value.

-
-
-
-Diagram -
-
Figure 1. Types of immediate produced by RISC-V instructions.
-
-
-
-Diagram -
-
-
-
-Diagram -
-
-
-
-Diagram -
-
-
-
-Diagram -
-
-
-

The fields are labeled with the instruction bits used to construct their value. Sign extension always uses inst[31].

-
-
- - - - - -
- - -
-

Sign extension is one of the most critical operations on immediates -(particularly for XLEN>32), and in RISC-V the sign bit for -all immediates is always held in bit 31 of the instruction to allow -sign extension to proceed in parallel with instruction decoding.

-
-
-

Although more complex implementations might have separate adders for -branch and jump calculations and so would not benefit from keeping the -location of immediate bits constant across types of instruction, we -wanted to reduce the hardware cost of the simplest implementations. By -rotating bits in the instruction encoding of B and J immediates instead -of using dynamic hardware muxes to multiply the immediate by 2, we -reduce instruction signal fanout and immediate mux costs by around a -factor of 2. The scrambled immediate encoding will add negligible time -to static or ahead-of-time compilation. For dynamic generation of -instructions, there is some small additional overhead, but the most -common short forward branches have straightforward immediate encodings.

-
-
-
-
-
-

2.4. Integer Computational Instructions

-
-

Most integer computational instructions operate on XLEN bits of values -held in the integer register file. Integer computational instructions -are either encoded as register-immediate operations using the I-type -format or as register-register operations using the R-type format. The -destination is register rd for both register-immediate and -register-register instructions. No integer computational instructions -cause arithmetic exceptions.

-
-
- - - - - -
- - -
-

We did not include special instruction-set support for overflow checks -on integer arithmetic operations in the base instruction set, as many -overflow checks can be cheaply implemented using RISC-V branches. -Overflow checking for unsigned addition requires only a single -additional branch instruction after the addition: -add t0, t1, t2; bltu t0, t1, overflow.

-
-
-

For signed addition, if one operand’s sign is known, overflow checking -requires only a single branch after the addition: -addi t0, t1, +imm; blt t0, t1, overflow. This covers the common case -of addition with an immediate operand.

-
-
-

For general signed addition, three additional instructions after the -addition are required, leveraging the observation that the sum should be -less than one of the operands if and only if the other operand is -negative.

-
-
-
-
         add t0, t1, t2
-         slti t3, t2, 0
-         slt t4, t0, t1
-         bne t3, t4, overflow
-
-
-
-

In RV64I, checks of 32-bit signed additions can be optimized further by -comparing the results of ADD and ADDW on the operands.

-
-
-
-
-

2.4.1. Integer Register-Immediate Instructions

-
-
-Diagram -
-
-
-

ADDI adds the sign-extended 12-bit immediate to register rs1. -Arithmetic overflow is ignored and the result is simply the low XLEN -bits of the result. ADDI rd, rs1, 0 is used to implement the MV rd, -rs1 assembler pseudoinstruction.

-
-
-

SLTI (set less than immediate) places the value 1 in register rd if -register rs1 is less than the sign-extended immediate when both are -treated as signed numbers, else 0 is written to rd. SLTIU is similar -but compares the values as unsigned numbers (i.e., the immediate is -first sign-extended to XLEN bits then treated as an unsigned number). -Note, SLTIU rd, rs1, 1 sets rd to 1 if rs1 equals zero, otherwise -sets rd to 0 (assembler pseudoinstruction SEQZ rd, rs).

-
-
-

ANDI, ORI, XORI are logical operations that perform bitwise AND, OR, and -XOR on register rs1 and the sign-extended 12-bit immediate and place -the result in rd. Note, XORI rd, rs1, -1 performs a bitwise logical -inversion of register rs1 (assembler pseudoinstruction NOT rd, rs).

-
-
-
-Diagram -
-
-
-

Shifts by a constant are encoded as a specialization of the I-type -format. The operand to be shifted is in rs1, and the shift amount is -encoded in the lower 5 bits of the I-immediate field. The right shift -type is encoded in bit 30. SLLI is a logical left shift (zeros are -shifted into the lower bits); SRLI is a logical right shift (zeros are -shifted into the upper bits); and SRAI is an arithmetic right shift (the -original sign bit is copied into the vacated upper bits).

-
-
-
-Diagram -
-
-
-

LUI (load upper immediate) is used to build 32-bit constants and uses -the U-type format. LUI places the 32-bit U-immediate value into the -destination register rd, filling in the lowest 12 bits with zeros.

-
-
-

AUIPC (add upper immediate to pc) is used to build pc-relative -addresses and uses the U-type format. AUIPC forms a 32-bit offset from -the U-immediate, filling in the lowest 12 bits with zeros, adds this -offset to the address of the AUIPC instruction, then places the result -in register rd.

-
-
- - - - - -
- - -
-

The assembly syntax for lui and auipc does not represent the lower -12 bits of the U-immediate, which are always zero.

-
-
-

The AUIPC instruction supports two-instruction sequences to access -arbitrary offsets from the PC for both control-flow transfers and data -accesses. The combination of an AUIPC and the 12-bit immediate in a JALR -can transfer control to any 32-bit PC-relative address, while an AUIPC -plus the 12-bit immediate offset in regular load or store instructions -can access any 32-bit PC-relative data address.

-
-
-

The current PC can be obtained by setting the U-immediate to 0. Although -a JAL +4 instruction could also be used to obtain the local PC (of the -instruction following the JAL), it might cause pipeline breaks in -simpler microarchitectures or pollute BTB structures in more complex -microarchitectures.

-
-
-
-
-
-

2.4.2. Integer Register-Register Operations

-
-

RV32I defines several arithmetic R-type operations. All operations read -the rs1 and rs2 registers as source operands and write the result -into register rd. The funct7 and funct3 fields select the type of -operation.

-
-
-
-Diagram -
-
-
-

ADD performs the addition of rs1 and rs2. SUB performs the -subtraction of rs2 from rs1. Overflows are ignored and the low XLEN -bits of results are written to the destination rd. SLT and SLTU -perform signed and unsigned compares respectively, writing 1 to rd if -rs1 < rs2, 0 otherwise. Note, SLTU rd, x0, rs2 sets rd to 1 if -rs2 is not equal to zero, otherwise sets rd to zero (assembler -pseudoinstruction SNEZ rd, rs). AND, OR, and XOR perform bitwise -logical operations.

-
-
-

SLL, SRL, and SRA perform logical left, logical right, and arithmetic -right shifts on the value in register rs1 by the shift amount held in -the lower 5 bits of register rs2.

-
-
-
-

2.4.3. NOP Instruction

-
-
-Diagram -
-
-
-

The NOP instruction does not change any architecturally visible state, -except for advancing the pc and incrementing any applicable -performance counters. NOP is encoded as ADDI x0, x0, 0.

-
-
- - - - - -
- - -
-

NOPs can be used to align code segments to microarchitecturally -significant address boundaries, or to leave space for inline code -modifications. Although there are many possible ways to encode a NOP, we -define a canonical NOP encoding to allow microarchitectural -optimizations as well as for more readable disassembly output. The other -NOP encodings are made available for HINT Instructions.

-
-
-

ADDI was chosen for the NOP encoding as this is most likely to take -fewest resources to execute across a range of systems (if not optimized -away in decode). In particular, the instruction only reads one register. -Also, an ADDI functional unit is more likely to be available in a -superscalar design as adds are the most common operation. In particular, -address-generation functional units can execute ADDI using the same -hardware needed for base+offset address calculations, while -register-register ADD or logical/shift operations require additional -hardware.

-
-
-
-
-
-
-

2.5. Control Transfer Instructions

-
-

RV32I provides two types of control transfer instructions: unconditional -jumps and conditional branches. Control transfer instructions in RV32I -do not have architecturally visible delay slots.

-
-
-

If an instruction access-fault or instruction page-fault exception -occurs on the target of a jump or taken branch, the exception is -reported on the target instruction, not on the jump or branch -instruction.

-
-
-

2.5.1. Unconditional Jumps

-
-

The jump and link (JAL) instruction uses the J-type format, where the -J-immediate encodes a signed offset in multiples of 2 bytes. The offset -is sign-extended and added to the address of the jump instruction to -form the jump target address. Jumps can therefore target a -±1 MiB range. JAL stores the address of the instruction -following the jump (pc+4) into register rd. The standard software -calling convention uses x1 as the return address register and x5 as -an alternate link register.

-
-
- - - - - -
- - -
-

The alternate link register supports calling millicode routines (e.g., -those to save and restore registers in compressed code) while preserving -the regular return address register. The register x5 was chosen as the -alternate link register as it maps to a temporary in the standard -calling convention, and has an encoding that is only one bit different -than the regular link register.

-
-
-
-
-

Plain unconditional jumps (assembler pseudoinstruction J) are encoded as -a JAL with rd=x0.

-
-
-
-Diagram -
-
-
-

The indirect jump instruction JALR (jump and link register) uses the -I-type encoding. The target address is obtained by adding the -sign-extended 12-bit I-immediate to the register rs1, then setting the -least-significant bit of the result to zero. The address of the -instruction following the jump (pc+4) is written to register rd. -Register x0 can be used as the destination if the result is not -required.

-
-
-
-Diagram -
-
-
- - - - - -
- - -
-

The unconditional jump instructions all use PC-relative addressing to -help support position-independent code. The JALR instruction was defined -to enable a two-instruction sequence to jump anywhere in a 32-bit -absolute address range. A LUI instruction can first load rs1 with the -upper 20 bits of a target address, then JALR can add in the lower bits. -Similarly, AUIPC then JALR can jump anywhere in a 32-bit pc-relative -address range.

-
-
-

Note that the JALR instruction does not treat the 12-bit immediate as -multiples of 2 bytes, unlike the conditional branch instructions. This -avoids one more immediate format in hardware. In practice, most uses of -JALR will have either a zero immediate or be paired with a LUI or AUIPC, -so the slight reduction in range is not significant.

-
-
-

Clearing the least-significant bit when calculating the JALR target -address both simplifies the hardware slightly and allows the low bit of -function pointers to be used to store auxiliary information. Although -there is potentially a slight loss of error checking in this case, in -practice jumps to an incorrect instruction address will usually quickly -raise an exception.

-
-
-

When used with a base rs1=x0, JALR can be used to -implement a single instruction subroutine call to the lowest or highest -address region from anywhere in the address space, which could be used -to implement fast calls to a small runtime library. Alternatively, an -ABI could dedicate a general-purpose register to point to a library -elsewhere in the address space.

-
-
-
-
-

The JAL and JALR instructions will generate an -instruction-address-misaligned exception if the target address is not -aligned to a four-byte boundary.

-
-
- - - - - -
- - -
-

Instruction-address-misaligned exceptions are not possible on machines -that support extensions with 16-bit aligned instructions, such as the -compressed instruction-set extension, C.

-
-
-
-
-

Return-address prediction stacks are a common feature of -high-performance instruction-fetch units, but require accurate detection -of instructions used for procedure calls and returns to be effective. -For RISC-V, hints as to the instructions' usage are encoded implicitly -via the register numbers used. A JAL instruction should push the return -address onto a return-address stack (RAS) only when rd is x1 or -x5. JALR instructions should push/pop a RAS as shown in Table 4.

-
- - ------ - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
Table 4. Return-address stack prediction hints encoded in the register operands of a JALR instruction.
rd is x1/x5 | rs1 is x1/x5 | rd=rs1 | RAS action

No

No

 — 

None

No

Yes

 — 

Pop

Yes

No

 — 

Push

Yes

Yes

No

Pop, then push

Yes

Yes

Yes

Push

-
- - - - - -
- - -
-

Some other ISAs added explicit hint bits to their indirect-jump -instructions to guide return-address stack manipulation. We use implicit -hinting tied to register numbers and the calling convention to reduce -the encoding space used for these hints.

-
-
-

When two different link registers (x1 and x5) are given as rs1 and -rd, then the RAS is both popped and pushed to support coroutines. If -rs1 and rd are the same link register (either x1 or x5), the RAS -is only pushed to enable macro-op fusion of the sequences: -lui ra, imm20; jalr ra, imm12(ra) and auipc ra, imm20; jalr ra, imm12(ra).

-
-
-
-
-
-

2.5.2. Conditional Branches

-
-

All branch instructions use the B-type instruction format. The 12-bit -B-immediate encodes signed offsets in multiples of 2 bytes. The offset -is sign-extended and added to the address of the branch instruction to -give the target address. The conditional branch range is -±4 KiB.

-
-
-
-Diagram -
-
-
-

Branch instructions compare two registers. BEQ and BNE take the branch -if registers rs1 and rs2 are equal or unequal respectively. BLT and -BLTU take the branch if rs1 is less than rs2, using signed and -unsigned comparison respectively. BGE and BGEU take the branch if rs1 -is greater than or equal to rs2, using signed and unsigned comparison -respectively. Note, BGT, BGTU, BLE, and BLEU can be synthesized by -reversing the operands to BLT, BLTU, BGE, and BGEU, respectively.

-
-
- - - - - -
- - -
-

Signed array bounds may be checked with a single BLTU instruction, since -any negative index will compare greater than any nonnegative bound.

-
-
-
-
-

Software should be optimized such that the sequential code path is the -most common path, with less-frequently taken code paths placed out of -line. Software should also assume that backward branches will be -predicted taken and forward branches as not taken, at least the first -time they are encountered. Dynamic predictors should quickly learn any -predictable branch behavior.

-
-
-

Unlike some other architectures, the RISC-V jump (JAL with rd=x0) -instruction should always be used for unconditional branches instead of -a conditional branch instruction with an always-true condition. RISC-V -jumps are also PC-relative and support a much wider offset range than -branches, and will not pollute conditional-branch prediction tables.

-
-
- - - - - -
- - -
-

The conditional branches were designed to include arithmetic comparison -operations between two registers (as also done in PA-RISC, Xtensa, and -MIPS R6), rather than use condition codes (x86, ARM, SPARC, PowerPC), or -to only compare one register against zero (Alpha, MIPS), or two -registers only for equality (MIPS). This design was motivated by the -observation that a combined compare-and-branch instruction fits into a -regular pipeline, avoids additional condition code state or use of a -temporary register, and reduces static code size and dynamic instruction -fetch traffic. Another point is that comparisons against zero require -non-trivial circuit delay (especially after the move to static logic in -advanced processes) and so are almost as expensive as arithmetic -magnitude compares. Another advantage of a fused compare-and-branch -instruction is that branches are observed earlier in the front-end -instruction stream, and so can be predicted earlier. There is perhaps an -advantage to a design with condition codes in the case where multiple -branches can be taken based on the same condition codes, but we believe -this case to be relatively rare.

-
-
-

We considered but did not include static branch hints in the instruction -encoding. These can reduce the pressure on dynamic predictors, but -require more instruction encoding space and software profiling for best -results, and can result in poor performance if production runs do not -match profiling runs.

-
-
-

We considered but did not include conditional moves or predicated -instructions, which can effectively replace unpredictable short forward -branches. Conditional moves are the simpler of the two, but are -difficult to use with conditional code that might cause exceptions -(memory accesses and floating-point operations). Predication adds -additional flag state to a system, additional instructions to set and -clear flags, and additional encoding overhead on every instruction. Both -conditional move and predicated instructions add complexity to -out-of-order microarchitectures, adding an implicit third source operand -due to the need to copy the original value of the destination -architectural register into the renamed destination physical register if -the predicate is false. Also, static compile-time decisions to use -predication instead of branches can result in lower performance on -inputs not included in the compiler training set, especially given that -unpredictable branches are rare, and becoming rarer as branch prediction -techniques improve.

-
-
-

We note that various microarchitectural techniques exist to dynamically -convert unpredictable short forward branches into internally predicated -code to avoid the cost of flushing pipelines on a branch mispredict (Heil & Smith, 1996), (Klauser et al., 1998), (Kim et al., 2005) and -have been implemented in commercial processors (Sinharoy et al., 2011). The simplest techniques -just reduce the penalty of recovering from a mispredicted short forward -branch by only flushing instructions in the branch shadow instead of the -entire fetch pipeline, or by fetching instructions from both sides using -wide instruction fetch or idle instruction fetch slots. More complex -techniques for out-of-order cores add internal predicates on -instructions in the branch shadow, with the internal predicate value -written by the branch instruction, allowing the branch and following -instructions to be executed speculatively and out-of-order with respect -to other code.

-
-
-
-
-

The conditional branch instructions will generate an -instruction-address-misaligned exception if the target address is not -aligned to a four-byte boundary and the branch condition evaluates to -true. If the branch condition evaluates to false, the -instruction-address-misaligned exception will not be raised.

-
-
- - - - - -
- - -
-

Instruction-address-misaligned exceptions are not possible on machines -that support extensions with 16-bit aligned instructions, such as the -compressed instruction-set extension, C.

-
-
-
-
-
-
-

2.6. Load and Store Instructions

-
-

RV32I is a load-store architecture, where only load and store -instructions access memory and arithmetic instructions only operate on -CPU registers. RV32I provides a 32-bit address space that is -byte-addressed. The EEI will define what portions of the address space -are legal to access with which instructions (e.g., some addresses might -be read only, or support word access only). Loads with a destination of -x0 must still raise any exceptions and cause any other side effects -even though the load value is discarded.

-
-
-

The EEI will define whether the memory system is little-endian or -big-endian. In RISC-V, endianness is byte-address invariant.

-
-
- - - - - -
- - -
-

In a system for which endianness is byte-address invariant, the -following property holds: if a byte is stored to memory at some address -in some endianness, then a byte-sized load from that address in any -endianness returns the stored value.

-
-
-

In a little-endian configuration, multibyte stores write the -least-significant register byte at the lowest memory byte address, -followed by the other register bytes in ascending order of their -significance. Loads similarly transfer the contents of the lesser memory -byte addresses to the less-significant register bytes.

-
-
-

In a big-endian configuration, multibyte stores write the -most-significant register byte at the lowest memory byte address, -followed by the other register bytes in descending order of their -significance. Loads similarly transfer the contents of the greater -memory byte addresses to the less-significant register bytes.

-
-
-
-
-
-Diagram -
-
-
-
-Diagram -
-
-
-

Load and store instructions transfer a value between the registers and -memory. Loads are encoded in the I-type format and stores are S-type. -The effective address is obtained by adding register rs1 to the -sign-extended 12-bit offset. Loads copy a value from memory to register -rd. Stores copy the value in register rs2 to memory.

-
-
-

The LW instruction loads a 32-bit value from memory into rd. LH loads -a 16-bit value from memory, then sign-extends to 32 bits before storing -in rd. LHU loads a 16-bit value from memory but then zero-extends to -32 bits before storing in rd. LB and LBU are defined analogously for -8-bit values. The SW, SH, and SB instructions store 32-bit, 16-bit, and -8-bit values from the low bits of register rs2 to memory.

-
-
-

Regardless of EEI, loads and stores whose effective addresses are -naturally aligned shall not raise an address-misaligned exception. Loads -and stores whose effective address is not naturally aligned to the -referenced datatype (i.e., the effective address is not divisible by the -size of the access in bytes) have behavior dependent on the EEI.

-
-
-

An EEI may guarantee that misaligned loads and stores are fully -supported, and so the software running inside the execution environment -will never experience a contained or fatal address-misaligned trap. In -this case, the misaligned loads and stores can be handled in hardware, -or via an invisible trap into the execution environment implementation, -or possibly a combination of hardware and invisible trap depending on -address.

-
-
-

An EEI may not guarantee misaligned loads and stores are handled -invisibly. In this case, loads and stores that are not naturally aligned -may either complete execution successfully or raise an exception. The -exception raised can be either an address-misaligned exception or an -access-fault exception. For a memory access that would otherwise be able -to complete except for the misalignment, an access-fault exception can -be raised instead of an address-misaligned exception if the misaligned -access should not be emulated, e.g., if accesses to the memory region -have side effects. When an EEI does not guarantee misaligned loads and -stores are handled invisibly, the EEI must define if exceptions caused -by address misalignment result in a contained trap (allowing software -running inside the execution environment to handle the trap) or a fatal -trap (terminating execution).

-
-
- - - - - -
- - -
-

Misaligned accesses are occasionally required when porting legacy code, -and help performance on applications when using any form of packed-SIMD -extension or handling externally packed data structures. Our rationale -for allowing EEIs to choose to support misaligned accesses via the -regular load and store instructions is to simplify the addition of -misaligned hardware support. One option would have been to disallow -misaligned accesses in the base ISAs and then provide some separate ISA -support for misaligned accesses, either special instructions to help -software handle misaligned accesses or a new hardware addressing mode -for misaligned accesses. Special instructions are difficult to use, -complicate the ISA, and often add new processor state (e.g., SPARC VIS -align address offset register) or complicate access to existing -processor state (e.g., MIPS LWL/LWR partial register writes). In -addition, for loop-oriented packed-SIMD code, the extra overhead when -operands are misaligned motivates software to provide multiple forms of -loop depending on operand alignment, which complicates code generation -and adds to loop startup overhead. New misaligned hardware addressing -modes take considerable space in the instruction encoding or require -very simplified addressing modes (e.g., register indirect only).

-
-
-
-
-

Even when misaligned loads and stores complete successfully, these -accesses might run extremely slowly depending on the implementation -(e.g., when implemented via an invisible trap). Furthermore, whereas -naturally aligned loads and stores are guaranteed to execute atomically, -misaligned loads and stores might not, and hence require additional -synchronization to ensure atomicity.

-
-
- - - - - -
- - -
-

We do not mandate atomicity for misaligned accesses so execution -environment implementations can use an invisible machine trap and a -software handler to handle some or all misaligned accesses. If hardware -misaligned support is provided, software can exploit this by simply -using regular load and store instructions. Hardware can then -automatically optimize accesses depending on whether runtime addresses -are aligned.

-
-
-
-
-
-

2.7. Memory Ordering Instructions

-
-
-mem order -
-
-
-

The FENCE instruction is used to order device I/O and memory accesses as -viewed by other RISC-V harts and external devices or coprocessors. Any -combination of device input (I), device output (O), memory reads (R), -and memory writes (W) may be ordered with respect to any combination of -the same. Informally, no other RISC-V hart or external device can -observe any operation in the successor set following a FENCE before -any operation in the predecessor set preceding the FENCE. -Chapter 18 provides a precise description -of the RISC-V memory consistency model.

-
-
-

The FENCE instruction also orders memory reads and writes made by the -hart as observed by memory reads and writes made by an external device. -However, FENCE does not order observations of events made by an external -device using any other signaling mechanism.

-
-
- - - - - -
- - -
-

A device might observe an access to a memory location via some external -communication mechanism, e.g., a memory-mapped control register that -drives an interrupt signal to an interrupt controller. This -communication is outside the scope of the FENCE ordering mechanism and -hence the FENCE instruction can provide no guarantee on when a change in -the interrupt signal is visible to the interrupt controller. Specific -devices might provide additional ordering guarantees to reduce software -overhead but those are outside the scope of the RISC-V memory model.

-
-
-
-
-

The EEI will define what I/O operations are possible, and in particular, -which memory addresses when accessed by load and store instructions will -be treated and ordered as device input and device output operations -respectively rather than memory reads and writes. For example, -memory-mapped I/O devices will typically be accessed with uncached loads -and stores that are ordered using the I and O bits rather than the R and -W bits. Instruction-set extensions might also describe new I/O -instructions that will also be ordered using the I and O bits in a -FENCE.

-
- - ----- - - - - - - - - - - - - - - - - - - - - - - - -
Table 5. Fence mode encoding
fm field | Mnemonic | Meaning

0000

none

Normal Fence

1000

TSO

With FENCE RW,RW: exclude write-to-read ordering; otherwise: Reserved for future use.

other

Reserved for future use.

-
-

The fence mode field fm defines the semantics of the FENCE. A FENCE -with fm=0000 orders all memory operations in its predecessor set -before all memory operations in its successor set.

-
-
-

The FENCE.TSO instruction is encoded as a FENCE instruction -with fm=1000, predecessor=RW, and successor=RW. FENCE.TSO orders -all load operations in its predecessor set before all memory operations -in its successor set, and all store operations in its predecessor set -before all store operations in its successor set. This leaves non-AMO -store operations in the FENCE.TSO’s predecessor set unordered with -non-AMO loads in its successor set.

-
-
- - - - - -
- - -
-

Because FENCE RW,RW imposes a superset of the orderings that FENCE.TSO -imposes, it is correct to ignore the fm field and implement FENCE.TSO as FENCE RW,RW.

-
-
-
-
-

The unused fields in the FENCE instructions—rs1 and rd—are reserved -for finer-grain fences in future extensions. For forward compatibility, -base implementations shall ignore these fields, and standard software -shall zero these fields. Likewise, many fm and predecessor/successor -set settings in Table 5 are also reserved for future use. -Base implementations shall treat all such reserved configurations as -normal fences with fm=0000, and standard software shall use only -non-reserved configurations.

-
-
- - - - - -
- - -
-

We chose a relaxed memory model to allow high performance from simple -machine implementations and from likely future coprocessor or -accelerator extensions. We separate out I/O ordering from memory R/W -ordering to avoid unnecessary serialization within a device-driver hart -and also to support alternative non-memory paths to control added -coprocessors or I/O devices. Simple implementations may additionally -ignore the predecessor and successor fields and always execute a -conservative fence on all operations.

-
-
-
-
-
-

2.8. Environment Call and Breakpoints

-
-

SYSTEM instructions are used to access system functionality that might -require privileged access and are encoded using the I-type instruction -format. These can be divided into two main classes: those that -atomically read-modify-write control and status registers (CSRs), and -all other potentially privileged instructions. CSR instructions are -described in Chapter 7, and the base -unprivileged instructions are described in the following section.

-
-
- - - - - -
- - -
-

The SYSTEM instructions are defined to allow simpler implementations to -always trap to a single software trap handler. More sophisticated -implementations might execute more of each system instruction in -hardware.

-
-
-
-
-
-Diagram -
-
-
-

These two instructions cause a precise requested trap to the supporting -execution environment.

-
-
-

The ECALL instruction is used to make a service request to the execution -environment. The EEI will define how parameters for the service request -are passed, but usually these will be in defined locations in the -integer register file.

-
-
-

The EBREAK instruction is used to return control to a debugging -environment.

-
-
- - - - - -
- - -
-

ECALL and EBREAK were previously named SCALL and SBREAK. The -instructions have the same functionality and encoding, but were renamed -to reflect that they can be used more generally than to call a -supervisor-level operating system or debugger.

-
-
-
-
- - - - - -
- - -
-

EBREAK was primarily designed to be used by a debugger to cause -execution to stop and fall back into the debugger. EBREAK is also used -by the standard gcc compiler to mark code paths that should not be -executed.

-
-
-

Another use of EBREAK is to support "semihosting", where the execution -environment includes a debugger that can provide services over an -alternate system call interface built around the EBREAK instruction. -Because the RISC-V base ISAs do not provide more than one EBREAK -instruction, RISC-V semihosting uses a special sequence of instructions -to distinguish a semihosting EBREAK from a debugger inserted EBREAK.

-
-
-
-
    slli x0, x0, 0x1f   # Entry NOP
-    ebreak              # Break to debugger
-    srai x0, x0, 7      # NOP encoding the semihosting call number 7
-
-
-
-

Note that these three instructions must be 32-bit-wide instructions, -i.e., they mustn’t be among the compressed 16-bit instructions described -in Chapter 28.

-
-
-

The shift NOP instructions are still considered available for use as -HINTs.

-
-
-

Semihosting is a form of service call and would be more naturally -encoded as an ECALL using an existing ABI, but this would require the -debugger to be able to intercept ECALLs, which is a newer addition to -the debug standard. We intend to move over to using ECALLs with a -standard ABI, in which case, semihosting can share a service ABI with an -existing standard.

-
-
-

We note that ARM processors have also moved to using SVC instead of BKPT -for semihosting calls in newer designs.

-
-
-
-
-
-

2.9. HINT Instructions

-
-

RV32I reserves a large encoding space for HINT instructions, which are -usually used to communicate performance hints to the microarchitecture. -Like the NOP instruction, HINTs do not change any architecturally -visible state, except for advancing the pc and any applicable -performance counters. Implementations are always allowed to ignore the -encoded hints.

-
-
-

Most RV32I HINTs are encoded as integer computational instructions with -rd=x0. The other RV32I HINTs are encoded as FENCE instructions with -a null predecessor or successor set and with fm=0.

-
-
- - - - - -
- - -
-

These HINT encodings have been chosen so that simple implementations can -ignore HINTs altogether, and instead execute a HINT as a regular -instruction that happens not to mutate the architectural state. For -example, ADD is a HINT if the destination register is x0; the five-bit -rs1 and rs2 fields encode arguments to the HINT. However, a simple -implementation can simply execute the HINT as an ADD of rs1 and rs2 -that writes x0, which has no architecturally visible effect.

-
-
-

As another example, a FENCE instruction with a zero pred field and a -zero fm field is a HINT; the succ, rs1, and rd fields encode the -arguments to the HINT. A simple implementation can simply execute the -HINT as a FENCE that orders the null set of prior memory accesses before -whichever subsequent memory accesses are encoded in the succ field. -Since the intersection of the predecessor and successor sets is null, -the instruction imposes no memory orderings, and so it has no -architecturally visible effect.

-
-
-
-
-

Table 6 lists all RV32I HINT code points. 91% of the -HINT space is reserved for standard HINTs. The remainder of the HINT -space is designated for custom HINTs: no standard HINTs will ever be -defined in this subspace.

-
-
- - - - - -
- - -
-

We anticipate standard hints to eventually include memory-system spatial -and temporal locality hints, branch prediction hints, thread-scheduling -hints, security tags, and instrumentation flags for simulation/emulation.

-
-
-
- - ------ - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
Table 6. RV32I HINT instructions.
Instruction | Constraints | Code Points | Purpose

LUI

rd=x0

$2^{20}$

Designated for future standard use

AUIPC

rd=x0

$2^{20}$

ADDI

rd=x0, and either rs1≠x0 or imm≠0

$2^{17} - 1$

ANDI

rd=x0

$2^{17}$

ORI

rd=x0

$2^{17}$

XORI

rd=x0

$2^{17}$

ADD

rd=x0, rs1≠x0

$2^{10} - 32$

ADD

rd=x0, rs1=x0, rs2≠x2-x5

28

ADD

rd=x0, rs1=x0, rs2=x2-x5

4

(rs2=x2) NTL.P1
-(rs2=x3) NTL.PALL
-(rs2=x4) NTL.S1
-(rs2=x5) NTL.ALL

SUB

rd=x0

$2^{10}$

Designated for future standard use

AND

rd=x0

$2^{10}$

OR

rd=x0

$2^{10}$

XOR

rd=x0

$2^{10}$

SLL

rd=x0

$2^{10}$

SRL

rd=x0

$2^{10}$

SRA

rd=x0

$2^{10}$

FENCE

rd=x0, rs1≠x0, fm=0, and either pred=0 or succ=0

$2^{10} - 63$

FENCE

rd≠x0, rs1=x0, fm=0, and either pred=0 or succ=0

$2^{10} - 63$

FENCE

rd=rs1=x0, fm=0, pred=0, succ≠0

15

FENCE

rd=rs1=x0, fm=0, pred≠W, succ=0

15

FENCE

rd=rs1=x0, fm=0, pred=W, succ=0

1

PAUSE

SLTI

rd=x0

$2^{17}$

Designated for custom use

SLTIU

rd=x0

$2^{17}$

SLLI

rd=x0

$2^{10}$

SRLI

rd=x0

$2^{10}$

SRAI

rd=x0

$2^{10}$

SLT

rd=x0

$2^{10}$

SLTU

rd=x0

$2^{10}$

-
-
-
-
-

3. RV32E and RV64E Base Integer Instruction Sets, Version 2.0

-
- -
-
-
-

4. RV64I Base Integer Instruction Set, Version 2.1

-
-
-

This chapter describes the RV64I base integer instruction set, which -builds upon the RV32I variant described in Chapter 2. -This chapter presents only the differences with RV32I, so should be read -in conjunction with the earlier chapter.

-
-
-

4.1. Register State

-
-

RV64I widens the integer registers and supported user address space to -64 bits (XLEN=64 in Table 3).

-
-
-
-

4.2. Integer Computational Instructions

-
-

Most integer computational instructions operate on XLEN-bit values. -Additional instruction variants are provided to manipulate 32-bit values -in RV64I, indicated by a 'W' suffix to the opcode. These "*W" -instructions ignore the upper 32 bits of their inputs and always produce -32-bit signed values, sign-extending them to 64 bits, i.e. bits XLEN-1 -through 31 are equal.

-
-
-

-

-
-
-

4.2.1. Integer Register-Immediate Instructions

-
-
-Diagram -
-
-
-

ADDIW is an RV64I instruction that adds the sign-extended 12-bit -immediate to register rs1 and produces the proper sign extension of a -32-bit result in rd. Overflows are ignored and the result is the low -32 bits of the result sign-extended to 64 bits. Note, ADDIW rd, rs1, 0 -writes the sign extension of the lower 32 bits of register rs1 into -register rd (assembler pseudoinstruction SEXT.W).

-
-
-
-Diagram -
-
-
-

Shifts by a constant are encoded as a specialization of the I-type -format using the same instruction opcode as RV32I. The operand to be -shifted is in rs1, and the shift amount is encoded in the lower 6 bits -of the I-immediate field for RV64I. The right shift type is encoded in -bit 30. SLLI is a logical left shift (zeros are shifted into the lower -bits); SRLI is a logical right shift (zeros are shifted into the upper -bits); and SRAI is an arithmetic right shift (the original sign bit is -copied into the vacated upper bits). - - - -

-
-
-
-Diagram -
-
-
-

SLLIW, SRLIW, and SRAIW are RV64I-only instructions that are analogously -defined but operate on 32-bit values and sign-extend their 32-bit -results to 64 bits. SLLIW, SRLIW, and SRAIW encodings with -imm[5] ≠ 0 are reserved.

-
-
-
-Diagram -
-
-
-

LUI (load upper immediate) uses the same opcode as RV32I. LUI places the -32-bit U-immediate into register rd, filling in the lowest 12 bits -with zeros. The 32-bit result is sign-extended to 64 bits. -

-
-
-

AUIPC (add upper immediate to pc) uses the same opcode as RV32I. AUIPC -is used to build pc-relative addresses and uses the U-type format. -AUIPC forms a 32-bit offset from the U-immediate, filling in the lowest -12 bits with zeros, sign-extends the result to 64 bits, adds it to the -address of the AUIPC instruction, then places the result in register -rd.

-
-
-
-

4.2.2. Integer Register-Register Operations

-
-
-Diagram -
-
-
-

ADDW and SUBW are RV64I-only instructions that are defined analogously -to ADD and SUB but operate on 32-bit values and produce signed 32-bit -results. Overflows are ignored, and the low 32 bits of the result are -sign-extended to 64 bits and written to the destination register. - -

-
-
-

SLL, SRL, and SRA perform logical left, logical right, and arithmetic -right shifts on the value in register rs1 by the shift amount held in -register rs2. In RV64I, only the low 6 bits of rs2 are considered -for the shift amount.

-
-
-

SLLW, SRLW, and SRAW are RV64I-only instructions that are analogously -defined but operate on 32-bit values and sign-extend their 32-bit -results to 64 bits. The shift amount is given by rs2[4:0]. - - -

-
-
-
-
-

4.3. Load and Store Instructions

-
-

RV64I extends the address space to 64 bits. The execution environment -will define what portions of the address space are legal to access.

-
-
-
-Diagram -
-
-
-
-Diagram -
-
-
-

The LD instruction loads a 64-bit value from memory into register rd -for RV64I. -

-
-
-

The LW instruction loads a 32-bit value from memory and sign-extends -this to 64 bits before storing it in register rd for RV64I. The LWU -instruction, on the other hand, zero-extends the 32-bit value from -memory for RV64I. LH and LHU are defined analogously for 16-bit values, -as are LB and LBU for 8-bit values. The SD, SW, SH, and SB instructions -store 64-bit, 32-bit, 16-bit, and 8-bit values from the low bits of -register rs2 to memory respectively.

-
-
-
-

4.4. HINT Instructions

-
-

All instructions that are microarchitectural HINTs in RV32I (see -Chapter 2) are also HINTs in RV64I. -The additional computational instructions in RV64I expand both the -standard and custom HINT encoding spaces. -

-
-
-

Table 7 lists all RV64I HINT code points. 91% of the -HINT space is reserved for standard HINTs, of which only a few (the NTL -hints and PAUSE listed in Table 7) are presently -defined. The remainder of the HINT space is designated for custom HINTs; -no standard HINTs will ever be defined in this subspace.

-
- - ------ - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
Table 7. RV64I HINT instructions.
Instruction | Constraints | Code Points | Purpose

LUI

rd=x0

$2^{20}$

Designated for future standard use

AUIPC

rd=x0

$2^{20}$

ADDI

rd=x0, and either rs1≠x0 or imm≠0

$2^{17} - 1$

ANDI

rd=x0

$2^{17}$

ORI

rd=x0

$2^{17}$

XORI

rd=x0

$2^{17}$

ADDIW

rd=x0

$2^{17}$

ADD

rd=x0, rs1≠x0

$2^{10} - 32$

ADD

rd=x0, rs1=x0, rs2≠x2-x5

28

ADD

rd=x0, rs1=x0, rs2=x2-x5

4

(rs2=x2) NTL.P1
- (rs2=x3) NTL.PALL
- (rs2=x4) NTL.S1
- (rs2=x5) NTL.ALL

SUB

rd=x0

$2^{10}$

Designated for future standard use

AND

rd=x0

$2^{10}$

OR

rd=x0

$2^{10}$

XOR

rd=x0

$2^{10}$

SLL

rd=x0

$2^{10}$

SRL

rd=x0

$2^{10}$

SRA

rd=x0

$2^{10}$

ADDW

rd=x0

$2^{10}$

SUBW

rd=x0

$2^{10}$

SLLW

rd=x0

$2^{10}$

SRLW

rd=x0

$2^{10}$

SRAW

rd=x0

$2^{10}$

FENCE

rd=x0, rs1≠x0, fm=0, and either pred=0 or succ=0

$2^{10}-63$

FENCE

rd≠x0, rs1=x0, fm=0, and either pred=0 or succ=0

$2^{10}-63$

FENCE

rd=rs1=x0, fm=0, pred=0, succ≠0

15

FENCE

rd=rs1=x0, fm=0, pred≠W, succ=0

15

FENCE

rd=rs1=x0, fm=0, pred=W, succ=0

1

PAUSE

SLTI

rd=x0

$2^{17}$

Designated for custom use

SLTIU

rd=x0

$2^{17}$

SLLI

rd=x0

$2^{11}$

SRLI

rd=x0

$2^{11}$

SRAI

rd=x0

$2^{11}$

SLLIW

rd=x0

$2^{10}$

SRLIW

rd=x0

$2^{10}$

SRAIW

rd=x0

$2^{10}$

SLT

rd=x0

$2^{10}$

SLTU

rd=x0

$2^{10}$

-
-
-
-
-

5. RV128I Base Integer Instruction Set, Version 1.7

-
-
-

CV64A6_MMU: This instruction set is not supported.

-
-
-
-
-

6. "Zifencei" Extension for Instruction-Fetch Fence, Version 2.0

-
- -
-
-
-

7. "Zicsr", Extension for Control and Status Register (CSR) Instructions, Version 2.0

-
-
-

RISC-V defines a separate address space of 4096 Control and Status -registers associated with each hart. This chapter defines the full set -of CSR instructions that operate on these CSRs.

-
-
- - - - - -
- - -
-

While CSRs are primarily used by the privileged architecture, there are -several uses in unprivileged code including for counters and timers, and -for floating-point status.

-
-
-

The counters and timers are no longer considered mandatory parts of the -standard base ISAs, and so the CSR instructions required to access them -have been moved out of Chapter 2 into this separate -chapter.

-
-
-
-
-

7.1. CSR Instructions

-
-

All CSR instructions atomically read-modify-write a single CSR, whose -CSR specifier is encoded in the 12-bit csr field of the instruction -held in bits 31-20. The immediate forms use a 5-bit zero-extended -immediate encoded in the rs1 field.

-
-
-
-Diagram -
-
-
-

The CSRRW (Atomic Read/Write CSR) instruction atomically swaps values in -the CSRs and integer registers. CSRRW reads the old value of the CSR, -zero-extends the value to XLEN bits, then writes it to integer register -rd. The initial value in rs1 is written to the CSR. If rd=x0, -then the instruction shall not read the CSR and shall not cause any of -the side effects that might occur on a CSR read.

-
-
-

The CSRRS (Atomic Read and Set Bits in CSR) instruction reads the value -of the CSR, zero-extends the value to XLEN bits, and writes it to -integer register rd. The initial value in integer register rs1 is -treated as a bit mask that specifies bit positions to be set in the CSR. -Any bit that is high in rs1 will cause the corresponding bit to be set -in the CSR, if that CSR bit is writable.

-
-
-

The CSRRC (Atomic Read and Clear Bits in CSR) instruction reads the -value of the CSR, zero-extends the value to XLEN bits, and writes it to -integer register rd. The initial value in integer register rs1 is -treated as a bit mask that specifies bit positions to be cleared in the -CSR. Any bit that is high in rs1 will cause the corresponding bit to -be cleared in the CSR, if that CSR bit is writable.

-
-
-

For both CSRRS and CSRRC, if rs1=x0, then the instruction will not -write to the CSR at all, and so shall not cause any of the side effects -that might otherwise occur on a CSR write, nor raise illegal-instruction -exceptions on accesses to read-only CSRs. Both CSRRS and CSRRC always -read the addressed CSR and cause any read side effects regardless of -rs1 and rd fields. -Note that if rs1 specifies a register other than x0, and that register -holds a zero value, the instruction will not action any attendant per-field -side effects, but will action any side effects caused by writing to the entire -CSR.

-
-
-

A CSRRW with rs1=x0 will attempt to write zero to the destination CSR.

-
-
-

The CSRRWI, CSRRSI, and CSRRCI variants are similar to CSRRW, CSRRS, and -CSRRC respectively, except they update the CSR using an XLEN-bit value -obtained by zero-extending a 5-bit unsigned immediate (uimm[4:0]) field -encoded in the rs1 field instead of a value from an integer register. -For CSRRSI and CSRRCI, if the uimm[4:0] field is zero, then these -instructions will not write to the CSR, and shall not cause any of the -side effects that might otherwise occur on a CSR write, nor raise -illegal-instruction exceptions on accesses to read-only CSRs. For -CSRRWI, if rd=x0, then the instruction shall not read the CSR and -shall not cause any of the side effects that might occur on a CSR read. -Both CSRRSI and CSRRCI will always read the CSR and cause any read side -effects regardless of rd and rs1 fields.

-
- - ------- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
Table 8. Conditions determining whether a CSR instruction reads or writes the specified CSR.
Register operand

Instruction

rd is x0

rs1 is x0

Reads CSR

Writes CSR

CSRRW

Yes

-

No

Yes

CSRRW

No

-

Yes

Yes

CSRRS/CSRRC

-

Yes

Yes

No

CSRRS/CSRRC

-

No

Yes

Yes

Immediate operand

Instruction

rd is x0

uimm=0

Reads CSR

Writes -CSR

CSRRWI

Yes

-

No

Yes

CSRRWI

No

-

Yes

Yes

CSRRSI/CSRRCI

-

Yes

Yes

No

CSRRSI/CSRRCI

-

No

Yes

Yes

-
-

Table 8 summarizes the behavior of the CSR -instructions with respect to whether they read and/or write the CSR.

-
-
-

In addition to side effects that occur as a consequence of reading or -writing a CSR, individual fields within a CSR might have side effects -when written. The CSRRW[I] instructions action side effects for all -such fields within the written CSR. The CSRRS[I] and CSRRC[I] instructions -only action side effects for fields for which the rs1 or uimm argument -has at least one bit set corresponding to that field.

-
-
- - - - - -
- - -
-

As of this writing, no standard CSRs have side effects on field writes. -Hence, whether a standard CSR access has any side effects can be determined -solely from the opcode.

-
-
-

Defining CSRs with side effects on field writes is not recommended.

-
-
-
-
-

For any event or consequence that occurs due to a CSR having a -particular value, if a write to the CSR gives it that value, the -resulting event or consequence is said to be an indirect effect of the -write. Indirect effects of a CSR write are not considered by the RISC-V -ISA to be side effects of that write.

-
-
- - - - - -
- - -
-

An example of side effects for CSR accesses would be if reading from a -specific CSR causes a light bulb to turn on, while writing an odd value -to the same CSR causes the light to turn off. Assume writing an even -value has no effect. In this case, both the read and write have side -effects controlling whether the bulb is lit, as this condition is not -determined solely from the CSR value. (Note that after writing an odd -value to the CSR to turn off the light, then reading to turn the light -on, writing again the same odd value causes the light to turn off again. -Hence, on the last write, it is not a change in the CSR value that turns -off the light.)

-
-
-

On the other hand, if a bulb is rigged to light whenever the value of a -particular CSR is odd, then turning the light on and off is not -considered a side effect of writing to the CSR but merely an indirect -effect of such writes.

-
-
-

More concretely, the RISC-V privileged architecture defined in Volume II -specifies that certain combinations of CSR values cause a trap to occur. -When an explicit write to a CSR creates the conditions that trigger the -trap, the trap is not considered a side effect of the write but merely -an indirect effect.

-
-
-

Standard CSRs do not have any side effects on reads. Standard CSRs may -have side effects on writes. Custom extensions might add CSRs for which -accesses have side effects on either reads or writes.

-
-
-
-
-

Some CSRs, such as the instructions-retired counter, instret, may be -modified as side effects of instruction execution. In these cases, if a -CSR access instruction reads a CSR, it reads the value prior to the -execution of the instruction. If a CSR access instruction writes such a -CSR, the explicit write is done instead of the update from the side effect. -In particular, a value -written to instret by one instruction will be the value read by the -following instruction.

-
-
-

The assembler pseudoinstruction to read a CSR, CSRR rd, csr, is -encoded as CSRRS rd, csr, x0. The assembler pseudoinstruction to write -a CSR, CSRW csr, rs1, is encoded as CSRRW x0, csr, rs1, while CSRWI -csr, uimm, is encoded as CSRRWI x0, csr, uimm.

-
-
-

Further assembler pseudoinstructions are defined to set and clear bits -in the CSR when the old value is not required: CSRS/CSRC csr, rs1; -CSRSI/CSRCI csr, uimm.

-
-
-

7.1.1. CSR Access Ordering

-
-

Each RISC-V hart normally observes its own CSR accesses, including its -implicit CSR accesses, as performed in program order. In particular, -unless specified otherwise, a CSR access is performed after the -execution of any prior instructions in program order whose behavior -modifies or is modified by the CSR state and before the execution of any -subsequent instructions in program order whose behavior modifies or is -modified by the CSR state. Furthermore, an explicit CSR read returns the -CSR state before the execution of the instruction, while an explicit CSR -write suppresses and overrides any implicit writes or modifications to -the same CSR by the same instruction.

-
-
-

Likewise, any side effects from an explicit CSR access are normally -observed to occur synchronously in program order. Unless specified -otherwise, the full consequences of any such side effects are observable -by the very next instruction, and no consequences may be observed -out-of-order by preceding instructions. (Note the distinction made -earlier between side effects and indirect effects of CSR writes.)

-
-
-

For the RVWMO memory consistency model (Chapter 18), CSR accesses are weakly -ordered by default, so other harts or devices may observe CSR accesses -in an order different from program order. In addition, CSR accesses are -not ordered with respect to explicit memory accesses, unless a CSR -access modifies the execution behavior of the instruction that performs -the explicit memory access or unless a CSR access and an explicit memory -access are ordered by either the syntactic dependencies defined by the -memory model or the ordering requirements defined by the Memory-Ordering -PMAs section in Volume II of this manual. To enforce ordering in all -other cases, software should execute a FENCE instruction between the -relevant accesses. For the purposes of the FENCE instruction, CSR read -accesses are classified as device input (I), and CSR write accesses are -classified as device output (O).

-
-
- - - - - -
- - -
-

Informally, the CSR space acts as a weakly ordered memory-mapped I/O -region, as defined by the Memory-Ordering PMAs section in Volume II of -this manual. As a result, the order of CSR accesses with respect to all -other accesses is constrained by the same mechanisms that constrain the -order of memory-mapped I/O accesses to such a region.

-
-
-

These CSR-ordering constraints are imposed to support ordering main -memory and memory-mapped I/O accesses with respect to CSR accesses that -are visible to, or affected by, devices or other harts. Examples include -the time, cycle, and mcycle CSRs, in addition to CSRs that reflect -pending interrupts, like mip and sip. Note that implicit reads of -such CSRs (e.g., taking an interrupt because of a change in mip) are -also ordered as device input.

-
-
-

Most CSRs (including, e.g., the fcsr) are not visible to other harts; -their accesses can be freely reordered in the global memory order with -respect to FENCE instructions without violating this specification.

-
-
-
-
-

The hardware platform may define that accesses to certain CSRs are -strongly ordered, as defined by the Memory-Ordering PMAs section in -Volume II of this manual. Accesses to strongly ordered CSRs have -stronger ordering constraints with respect to accesses to both weakly -ordered CSRs and accesses to memory-mapped I/O regions.

-
-
- - - - - -
- - -
-

The rules for the reordering of CSR accesses in the global memory order -should probably be moved to Chapter 18 concerning the RVWMO memory consistency model.

-
-
-
-
-
-
-
-
-

8. "Zicntr" and "Zihpm" Extensions for Counters, Version 2.0

-
-
-

RISC-V ISAs provide a set of up to thirty-two 64-bit performance -counters and timers that are accessible via unprivileged XLEN-bit -read-only CSR registers 0xC00–0xC1F (when XLEN=32, the upper 32 bits -are accessed via CSR registers 0xC80–0xC9F). These counters are -divided between the "Zicntr" and "Zihpm" extensions.

-
-
-

8.1. "Zicntr" Extension for Base Counters and Timers

-
-

The Zicntr standard extension comprises the first three of these -counters (CYCLE, TIME, and INSTRET), which have dedicated functions -(cycle count, real-time clock, and instructions retired, respectively). -The Zicntr extension depends on the Zicsr extension.

-
-
- - - - - -
- - -
-

We recommend provision of these basic counters in implementations as -they are essential for basic performance analysis, adaptive and dynamic -optimization, and to allow an application to work with real-time -streams. Additional counters in the separate Zihpm extension can help -diagnose performance problems and these should be made accessible from -user-level application code with low overhead.

-
-
-

Some execution environments might prohibit access to counters, for -example, to impede timing side-channel attacks.

-
-
-
-
-
-Diagram -
-
-
-

For base ISAs with XLEN≥64, CSR instructions can access -the full 64-bit CSRs directly. In particular, the RDCYCLE, RDTIME, and -RDINSTRET pseudoinstructions read the full 64 bits of the cycle, -time, and instret counters.

-
-
- - - - - -
- - -
-

The counter pseudoinstructions are mapped to the read-only -csrrs rd, counter, x0 canonical form, but the other read-only CSR -instruction forms (based on CSRRC/CSRRSI/CSRRCI) are also legal ways to -read these CSRs.

-
-
-
-
-

For base ISAs with XLEN=32, the Zicntr extension enables the three -64-bit read-only counters to be accessed in 32-bit pieces. The RDCYCLE, -RDTIME, and RDINSTRET pseudoinstructions provide the lower 32 bits, and -the RDCYCLEH, RDTIMEH, and RDINSTRETH pseudoinstructions provide the -upper 32 bits of the respective counters.

-
-
- - - - - -
- - -
-

We required the counters be 64 bits wide, even when XLEN=32, as -otherwise it is very difficult for software to determine if values have -overflowed. For a low-end implementation, the upper 32 bits of each -counter can be implemented using software counters incremented by a trap -handler triggered by overflow of the lower 32 bits. The sample code -given below shows how the full 64-bit width value can be safely read -using the individual 32-bit width pseudoinstructions.

-
-
-
-
-

The RDCYCLE pseudoinstruction reads the low XLEN bits of the cycle -CSR which holds a count of the number of clock cycles executed by the -processor core on which the hart is running from an arbitrary start time -in the past. RDCYCLEH is only present when XLEN=32 and reads bits 63-32 -of the same cycle counter. The underlying 64-bit counter should never -overflow in practice. The rate at which the cycle counter advances will -depend on the implementation and operating environment. The execution -environment should provide a means to determine the current rate -(cycles/second) at which the cycle counter is incrementing.

-
-
- - - - - -
- - -
-

RDCYCLE is intended to return the number of cycles executed by the -processor core, not the hart. Precisely defining what is a "core" is -difficult given some implementation choices (e.g., AMD Bulldozer). -Precisely defining what is a "clock cycle" is also difficult given the -range of implementations (including software emulations), but the intent -is that RDCYCLE is used for performance monitoring along with the other -performance counters. In particular, where there is one hart/core, one -would expect cycle-count/instructions-retired to measure CPI for a hart.

-
-
-

Cores don’t have to be exposed to software at all, and an implementor -might choose to pretend multiple harts on one physical core are running -on separate cores with one hart/core, and provide separate cycle -counters for each hart. This might make sense in a simple barrel -processor (e.g., CDC 6600 peripheral processors) where inter-hart timing -interactions are non-existent or minimal.

-
-
-

Where there is more than one hart/core and dynamic multithreading, it is -not generally possible to separate out cycles per hart (especially with -SMT). It might be possible to define a separate performance counter that -tried to capture the number of cycles a particular hart was running, but -this definition would have to be very fuzzy to cover all the possible -threading implementations. For example, should we only count cycles for -which any instruction was issued to execution for this hart, and/or -cycles any instruction retired, or include cycles this hart was -occupying machine resources but couldn’t execute due to stalls while -other harts went into execution? Likely, "all of the above" would be -needed to have understandable performance stats. This complexity of -defining a per-hart cycle count, and also the need in any case for a -total per-core cycle count when tuning multithreaded code led to just -standardizing the per-core cycle counter, which also happens to work -well for the common single hart/core case.

-
-
-

Standardizing what happens during "sleep" is not practical given that -what "sleep" means is not standardized across execution environments, -but if the entire core is paused (entirely clock-gated or powered-down -in deep sleep), then it is not executing clock cycles, and the cycle -count shouldn’t be increasing per the spec. There are many details, -e.g., whether clock cycles required to reset a processor after waking up -from a power-down event should be counted, and these are considered -execution-environment-specific details.

-
-
-

Even though there is no precise definition that works for all platforms, -this is still a useful facility for most platforms, and an imprecise, -common, "usually correct" standard here is better than no standard. -The intent of RDCYCLE was primarily performance monitoring/tuning, and -the specification was written with that goal in mind.

-
-
-
-
-

The RDTIME pseudoinstruction reads the low XLEN bits of the "time" CSR, -which counts wall-clock real time that has passed from an arbitrary -start time in the past. RDTIMEH is only present when XLEN=32 and reads -bits 63-32 of the same real-time counter. The underlying 64-bit counter -increments by one with each tick of the real-time clock, and, for -realistic real-time clock frequencies, should never overflow in -practice. The execution environment should provide a means of -determining the period of a counter tick (seconds/tick). The period -should be constant within a small error bound. The environment should -provide a means to determine the accuracy of the clock (i.e., the -maximum relative error between the nominal and actual real-time clock -periods).

-
-
- - - - - -
- - -
-

On some simple platforms, cycle count might represent a valid -implementation of RDTIME, in which case RDTIME and RDCYCLE may return -the same result.

-
-
-

It is difficult to provide a strict mandate on clock period given the -wide variety of possible implementation platforms. The maximum error -bound should be set based on the requirements of the platform.

-
-
-
-
-

The real-time clocks of all harts must be synchronized to within one -tick of the real-time clock.

-
-
- - - - - -
- - -
-

As with other architectural mandates, it suffices to appear "as if" -harts are synchronized to within one tick of the real-time clock, i.e., -software is unable to observe that there is a greater delta between the -real-time clock values observed on two harts.

-
-
-
-
-

The RDINSTRET pseudoinstruction reads the low XLEN bits of the -instret CSR, which counts the number of instructions retired by this -hart from some arbitrary start point in the past. RDINSTRETH is only -present when XLEN=32 and reads bits 63-32 of the same instruction -counter. The underlying 64-bit counter should never overflow in -practice.

-
-
- - - - - -
- - -
-

Instructions that cause synchronous exceptions, including ECALL and -EBREAK, are not considered to retire and hence do not increment the -instret CSR.

-
-
-
-
-

The following code sequence will read a valid 64-bit cycle counter value -into x3:x2, even if the counter overflows its lower half between -reading its upper and lower halves.

-
-
-
Listing 1. Sample code for reading the 64-bit cycle counter when XLEN=32.
-
-
    again:
-        rdcycleh     x3
-        rdcycle      x2
-        rdcycleh     x4
-        bne          x3, x4, again
-
-
-
-
-

8.2. "Zihpm" Extension for Hardware Performance Counters

- -
-
-
-
-

9. "Zihintntl" Extension for Non-Temporal Locality Hints, Version 1.0

-
- -
-
-
-

10. "Zihintpause" Extension for Pause Hint, Version 2.0

-
- -
-
-
-

11. "Zimop" Extension for May-Be-Operations, Version 1.0

-
-
-

11.1. "Zcmop" Compressed May-Be-Operations Extension, Version 1.0

- -
-
-
-
-

12. "Zicond" Extension for Integer Conditional Operations, Version 1.0.0

-
- -
-
-
-

13. "M" Extension for Integer Multiplication and Division, Version 2.0

-
-
-

This chapter describes the standard integer multiplication and division -instruction extension, which is named "M" and contains instructions -that multiply or divide values held in two integer registers.

-
-
- - - - - -
- - -
-

We separate integer multiply and divide out from the base to simplify -low-end implementations, or for applications where integer multiply and -divide operations are either infrequent or better handled in attached -accelerators.

-
-
-
-
-

13.1. Multiplication Operations

-
-
-Diagram -
-
-
-

- -

-
-
-

MUL performs an XLEN-bit×XLEN-bit multiplication of -rs1 by rs2 and places the lower XLEN bits in the destination -register. MULH, MULHU, and MULHSU perform the same multiplication but -return the upper XLEN bits of the full 2×XLEN-bit -product, for signed×signed, -unsigned×unsigned, and signed rs1×unsigned rs2 multiplication, respectively. -If both the high and low bits of the same product are required, then the recommended code sequence is: MULH[[S]U] rdh, rs1, rs2; MUL rdl, rs1, rs2 (source register specifiers must be in the same order and rdh cannot be the same as rs1 or rs2). Microarchitectures can then fuse these into a single multiply operation instead of performing two separate multiplies.

-
-
- - - - - -
- - -
-

MULHSU is used in multi-word signed multiplication to multiply the -most-significant word of the multiplicand (which contains the sign bit) -with the less-significant words of the multiplier (which are unsigned).

-
-
-
-
-

MULW is an RV64 instruction that multiplies the lower 32 bits of the -source registers, placing the sign extension of the lower 32 bits of the -result into the destination register.

-
-
- - - - - -
- - -
-

In RV64, MUL can be used to obtain the upper 32 bits of the 64-bit -product, but signed arguments must be proper 32-bit signed values, -whereas unsigned arguments must have their upper 32 bits clear. If the -arguments are not known to be sign- or zero-extended, an alternative is -to shift both arguments left by 32 bits, then use MULH[[S]U].

-
-
-
-
-
-

13.2. Division Operations

-
-
-Diagram -
-
-
-

-

-
-
-

DIV and DIVU perform an XLEN bits by XLEN bits signed and unsigned -integer division of rs1 by rs2, rounding towards zero. REM and REMU -provide the remainder of the corresponding division operation. For REM, -the sign of a nonzero result equals the sign of the dividend.

-
-
- - - - - -
- - -
-

For both signed and unsigned division, except in the case of overflow, it holds -that -$\textrm{dividend} = \textrm{divisor} \times \textrm{quotient} + \textrm{remainder}$.

-
-
-
-
-

If both the quotient and remainder are required from the same division, -the recommended code sequence is: DIV[U] rdq, rs1, rs2; REM[U] rdr, -rs1, rs2 (rdq cannot be the same as rs1 or rs2). -Microarchitectures can then fuse these into a single divide operation -instead of performing two separate divides.

-
-
-

DIVW and DIVUW are RV64 instructions that divide the lower 32 bits of -rs1 by the lower 32 bits of rs2, treating them as signed and -unsigned integers respectively, placing the 32-bit quotient in rd, -sign-extended to 64 bits. REMW and REMUW are RV64 instructions that -provide the corresponding signed and unsigned remainder operations -respectively. Both REMW and REMUW always sign-extend the 32-bit result -to 64 bits, including on a divide by zero. -

-
-
-

The semantics for division by zero and division overflow are summarized -in Table 9. The quotient of division by zero has all bits -set, and the remainder of division by zero equals the dividend. Signed -division overflow occurs only when the most-negative integer is divided -by $-1$. The quotient of a signed division with overflow is -equal to the dividend, and the remainder is zero. Unsigned division -overflow cannot occur.

-
- - --------- - - - - - - - - - - - - - - - - - - - - - - -
Table 9. Semantics for division by zero and division overflow. L is the width of the operation in bits: XLEN for DIV[U] and REM[U], or 32 for DIV[U]W and REM[U]W.
ConditionDividendDivisorDIVU[W]REMU[W]DIV[W]REM[W]

Division by zero
-Overflow (signed only)

$x$
-$-2^{L-1}$

0
-$-1$

$2^{L}-1$
- -

$x$
- -

$-1$
- $-2^{L-1}$

$x$
- 0

-
- - - - - -
- - -
-

We considered raising exceptions on integer divide by zero, with these -exceptions causing a trap in most execution environments. However, this -would be the only arithmetic trap in the standard ISA (floating-point -exceptions set flags and write default values, but do not cause traps) -and would require language implementors to interact with the execution -environment’s trap handlers for this case. Further, where language -standards mandate that a divide-by-zero exception must cause an -immediate control flow change, only a single branch instruction needs to -be added to each divide operation, and this branch instruction can be -inserted after the divide and should normally be very predictably not -taken, adding little runtime overhead.

-
-
-

The value of all bits set is returned for both unsigned and signed -divide by zero to simplify the divider circuitry. The value of all 1s is -both the natural value to return for unsigned divide, representing the -largest unsigned number, and also the natural result for simple unsigned -divider implementations. Signed division is often implemented using an -unsigned division circuit and specifying the same overflow result -simplifies the hardware.

-
-
-
-
-
-

13.3. Zmmul Extension, Version 1.0

-
-

The Zmmul extension implements the multiplication subset of the M -extension. It adds all of the instructions defined in -Section 13.1, namely: MUL, MULH, MULHU, -MULHSU, and (for RV64 only) MULW. The encodings are identical to those -of the corresponding M-extension instructions. M implies Zmmul. -

-
-
- - - - - -
- - -
-

The Zmmul extension enables low-cost implementations that require -multiplication operations but not division. For many microcontroller -applications, division operations are too infrequent to justify the cost -of divider hardware. By contrast, multiplication operations are more -frequent, making the cost of multiplier hardware more justifiable. -Simple FPGA soft cores particularly benefit from eliminating division -but retaining multiplication, since many FPGAs provide hardwired -multipliers but require dividers be implemented in soft logic.

-
-
-
-
-
-
-
-

14. "A" Extension for Atomic Instructions, Version 2.1

-
-
-

CV64A6_MMU: This extension is not supported.

-
-
-
-
-

15. "Zawrs" Extension for Wait-on-Reservation-Set instructions, Version 1.01

-
- -
-
-
-

16. "Zacas" Extension for Atomic Compare-and-Swap (CAS) Instructions, Version 1.0.0

-
- -
-
-
-

17. "Zabha" Extension for Byte and Halfword Atomic Memory Operations, Version 1.0.0

-
- -
-
-
-

18. RVWMO Memory Consistency Model, Version 2.0

-
-
-

This chapter defines the RISC-V memory consistency model. A memory -consistency model is a set of rules specifying the values that can be -returned by loads of memory. RISC-V uses a memory model called "RVWMO" -(RISC-V Weak Memory Ordering) which is designed to provide flexibility -for architects to build high-performance scalable designs while -simultaneously supporting a tractable programming model. - -

-
-
-

Under RVWMO, code running on a single hart appears to execute in order -from the perspective of other memory instructions in the same hart, but -memory instructions from another hart may observe the memory -instructions from the first hart being executed in a different order. -Therefore, multithreaded code may require explicit synchronization to -guarantee ordering between memory instructions from different harts. The -base RISC-V ISA provides a FENCE instruction for this purpose, described -in Section 2.7, while the atomics extension "A" additionally defines load-reserved/store-conditional and atomic read-modify-write instructions. -

-
-
-

The standard ISA extension for total store ordering "Ztso" (Chapter 19) augments -RVWMO with additional rules specific to that extension.

-
-
-

The appendices to this specification provide both axiomatic and -operational formalizations of the memory consistency model as well as -additional explanatory material. - -

-
-
- - - - - -
- - -
-

This chapter defines the memory model for regular main memory -operations. The interaction of the memory model with I/O memory, -instruction fetches, FENCE.I, page table walks, and SFENCE.VMA is not -(yet) formalized. Some or all of the above may be formalized in a future -revision of this specification. The RV128 base ISA and future ISA -extensions such as the V vector and J JIT extensions will need -to be incorporated into a future revision as well.

-
-
-

Memory consistency models supporting overlapping memory accesses of -different widths simultaneously remain an active area of academic -research and are not yet fully understood. The specifics of how memory -accesses of different sizes interact under RVWMO are specified to the -best of our current abilities, but they are subject to revision should -new issues be uncovered.

-
-
-
-
-

18.1. Definition of the RVWMO Memory Model

-
-

The RVWMO memory model is defined in terms of the global memory order, -a total ordering of the memory operations produced by all harts. In -general, a multithreaded program has many different possible executions, -with each execution having its own corresponding global memory order. -

-
-
-

The global memory order is defined over the primitive load and store -operations generated by memory instructions. It is then subject to the -constraints defined in the rest of this chapter. Any execution -satisfying all of the memory model constraints is a legal execution (as -far as the memory model is concerned).

-
-
-

18.1.1. Memory Model Primitives

-
-

The program order over memory operations reflects the order in which -the instructions that generate each load and store are logically laid -out in that hart’s dynamic instruction stream; i.e., the order in which -a simple in-order processor would execute the instructions of that hart.

-
-
-

Memory-accessing instructions give rise to memory operations. A memory -operation can be either a load operation, a store operation, or both -simultaneously. All memory operations are single-copy atomic: they can -never be observed in a partially complete state. -

-
-
-

Among instructions in RV32GC and RV64GC, each aligned memory instruction -gives rise to exactly one memory operation, with two exceptions. First, -an unsuccessful SC instruction does not give rise to any memory -operations. Second, FLD and FSD instructions may each give rise to -multiple memory operations if XLEN<64, as stated in -[fld_fsd] and clarified below. An aligned AMO -gives rise to a single memory operation that is both a load operation -and a store operation simultaneously.

-
-
- - - - - -
- - -
-

Instructions in the RV128 base instruction set and in future ISA -extensions such as V (vector) and P (SIMD) may give rise to multiple -memory operations. However, the memory model for these extensions has -not yet been formalized.

-
-
-
-
-

A misaligned load or store instruction may be decomposed into a set of -component memory operations of any granularity. An FLD or FSD -instruction for which XLEN<64 may also be decomposed into -a set of component memory operations of any granularity. The memory -operations generated by such instructions are not ordered with respect -to each other in program order, but they are ordered normally with -respect to the memory operations generated by preceding and subsequent -instructions in program order. -The atomics extension "A" does not require execution environments to support -misaligned atomic instructions at all. -However, if misaligned atomics are supported via the misaligned atomicity -granule PMA, then AMOs within an atomicity granule are not decomposed, nor are -loads and stores defined in the base ISAs, nor are loads and stores of no more -than XLEN bits defined in the F, D, and Q extensions. -

-
-
- - - - - -
- - -
-

The decomposition of misaligned memory operations down to byte -granularity facilitates emulation on implementations that do not -natively support misaligned accesses. Such implementations might, for -example, simply iterate over the bytes of a misaligned access one by -one.

-
-
-
-
-

An LR instruction and an SC instruction are said to be paired if the -LR precedes the SC in program order and if there are no other LR or SC -instructions in between; the corresponding memory operations are said to -be paired as well (except in case of a failed SC, where no store -operation is generated). The complete list of conditions determining -whether an SC must succeed, may succeed, or must fail is defined in -[sec:lrsc].

-
-
-

Load and store operations may also carry one or more ordering -annotations from the following set: "acquire-RCpc", "acquire-RCsc", -"release-RCpc", and "release-RCsc". An AMO or LR instruction with -aq set has an "acquire-RCsc" annotation. An AMO or SC instruction -with rl set has a "release-RCsc" annotation. An AMO, LR, or SC -instruction with both aq and rl set has both "acquire-RCsc" and -"release-RCsc" annotations.

-
-
-

For convenience, we use the term "acquire annotation" to refer to an -acquire-RCpc annotation or an acquire-RCsc annotation. Likewise, a -"release annotation" refers to a release-RCpc annotation or a -release-RCsc annotation. An "RCpc annotation" refers to an -acquire-RCpc annotation or a release-RCpc annotation. An RCsc -annotation refers to an acquire-RCsc annotation or a release-RCsc -annotation.

-
-
- - - - - -
- - -
-

In the memory model literature, the term "RCpc" stands for release -consistency with processor-consistent synchronization operations, and -the term "RCsc" stands for release consistency with sequentially -consistent synchronization operations.

-
-
-

While there are many different definitions for acquire and release -annotations in the literature, in the context of RVWMO these terms are -concisely and completely defined by Preserved Program Order rules 5-7.

-
-
-

"RCpc" annotations are currently only used when implicitly assigned to -every memory access per the standard extension "Ztso" -(Chapter 19). Furthermore, although the ISA does not -currently contain native load-acquire or store-release instructions, nor -RCpc variants thereof, the RVWMO model itself is designed to be -forwards-compatible with the potential addition of any or all of the -above into the ISA in a future extension.

-
-
-
-
-
-

18.1.2. Syntactic Dependencies

-
-

The definition of the RVWMO memory model depends in part on the notion -of a syntactic dependency, defined as follows.

-
-
-

In the context of defining dependencies, a register refers either to -an entire general-purpose register, some portion of a CSR, or an entire -CSR. The granularity at which dependencies are tracked through CSRs is -specific to each CSR and is defined in -Section 18.2.

-
-
-

Syntactic dependencies are defined in terms of instructions' source -registers, instructions' destination registers, and the way -instructions carry a dependency from their source registers to their -destination registers. This section provides a general definition of all -of these terms; however, Section 18.3 provides a -complete listing of the specifics for each instruction.

-
-
-

In general, a register r other than x0 is a source -register for an instruction i if any of the following -hold:

-
-
-
    -
  • -

    In the opcode of i, rs1, rs2, or rs3 is set to -r

    -
  • -
  • -

    i is a CSR instruction, and in the opcode of -i, csr is set to r, unless i -is CSRRW or CSRRWI and rd is set to x0

    -
  • -
  • -

    r is a CSR and an implicit source register for -i, as defined in Section 18.3

    -
  • -
  • -

    r is a CSR that aliases with another source register for -i

    -
  • -
-
-
-

Memory instructions also further specify which source registers are -address source registers and which are data source registers.

-
-
-

In general, a register r other than x0 is a destination -register for an instruction i if any of the following -hold:

-
-
-
    -
  • -

    In the opcode of i, rd is set to r

    -
  • -
  • -

    i is a CSR instruction, and in the opcode of -i, csr is set to r, unless i -is CSRRS or CSRRC and rs1 is set to x0 or i is CSRRSI -or CSRRCI and uimm[4:0] is set to zero.

    -
  • -
  • -

    r is a CSR and an implicit destination register for -i, as defined in Section 18.3

    -
  • -
  • -

    r is a CSR that aliases with another destination -register for i

    -
  • -
-
-
-

Most non-memory instructions carry a dependency from each of their -source registers to each of their destination registers. However, there -are exceptions to this rule; see Section 18.3.

-
-
-

Instruction j has a syntactic dependency on instruction -i via destination register s of -i and source register r of j -if either of the following hold:

-
-
-
    -
  • -

    s is the same as r, and no instruction -program-ordered between i and j has -r as a destination register

    -
  • -
  • -

    There is an instruction m program-ordered between -i and j such that all of the following hold:

    -
    -
      -
    1. -

      j has a syntactic dependency on m via -destination register q and source register r

      -
    2. -
    3. -

      m has a syntactic dependency on i via -destination register s and source register p

      -
    4. -
    5. -

      m carries a dependency from p to -q

      -
    6. -
    -
    -
  • -
-
-
-

Finally, in the definitions that follow, let a and -b be two memory operations, and let i and -j be the instructions that generate a and -b, respectively.

-
-
-

b has a syntactic address dependency on a -if r is an address source register for j and -j has a syntactic dependency on i via source -register r

-
-
-

b has a syntactic data dependency on a if -b is a store operation, r is a data source -register for j, and j has a syntactic -dependency on i via source register r

-
-
-

b has a syntactic control dependency on a -if there is an instruction m program-ordered between -i and j such that m is a -branch or indirect jump and m has a syntactic dependency -on i.

-
-
- - - - - -
- - -
-

Generally speaking, non-AMO load instructions do not have data source -registers, and unconditional non-AMO store instructions do not have -destination registers. However, a successful SC instruction is -considered to have the register specified in rd as a destination -register, and hence it is possible for an instruction to have a -syntactic dependency on a successful SC instruction that precedes it in -program order.

-
-
-
-
-
-

18.1.3. Preserved Program Order

-
-

The global memory order for any given execution of a program respects -some but not all of each hart’s program order. The subset of program -order that must be respected by the global memory order is known as -preserved program order.

-
-
-

The complete definition of preserved program order is as follows (and -note that AMOs are simultaneously both loads and stores): memory -operation a precedes memory operation b in -preserved program order (and hence also in the global memory order) if -a precedes b in program order, -a and b both access regular main memory -(rather than I/O regions), and any of the following hold:

-
-
-
    -
  • -

    Overlapping-Address Orderings:

    -
    -
      -
    1. -

      b is a store, and -a and b access overlapping memory addresses

      -
    2. -
    3. -

      a and b are loads, -x is a byte read by both a and -b, there is no store to x between -a and b in program order, and -a and b return values for x -written by different memory operations

      -
    4. -
    5. -

      a is -generated by an AMO or SC instruction, b is a load, and -b returns a value written by a

      -
    6. -
    -
    -
  • -
  • -

    Explicit Synchronization

    -
    -
      -
    1. -

      There is a FENCE instruction that -orders a before b

      -
    2. -
    3. -

      a has an acquire -annotation

      -
    4. -
    5. -

      b has a release annotation

      -
    6. -
    7. -

      a and b both have -RCsc annotations

      -
    8. -
    9. -

      a is paired with -b

      -
    10. -
    -
    -
  • -
  • -

    Syntactic Dependencies

    -
    -
      -
    1. -

      b has a syntactic address -dependency on a

      -
    2. -
    3. -

      b has a syntactic data -dependency on a

      -
    4. -
    5. -

      b is a store, and -b has a syntactic control dependency on a

      -
    6. -
    -
    -
  • -
  • -

    Pipeline Dependencies

    -
    -
      -
    1. -

      b is a -load, and there exists some store m between -a and b in program order such that -m has an address or data dependency on a, -and b returns a value written by m

      -
    2. -
    3. -

      b is a store, and -there exists some instruction m between a -and b in program order such that m has an -address dependency on a

      -
    4. -
    -
    -
  • -
-
-
-
-

18.1.4. Memory Model Axioms

-
-

An execution of a RISC-V program obeys the RVWMO memory consistency -model only if there exists a global memory order conforming to preserved -program order and satisfying the load value axiom, the atomicity -axiom, and the progress axiom.

-
-
-
Load Value Axiom
-
-

Each byte of each load i returns the value written to that -byte by the store that is the latest in global memory order among the -following stores:

-
-
-
    -
  1. -

    Stores that write that byte and that precede i in the -global memory order

    -
  2. -
  3. -

    Stores that write that byte and that precede i in -program order

    -
  4. -
-
-
-
-
Atomicity Axiom
-
-

If r and w are paired load and store -operations generated by aligned LR and SC instructions in a hart -h, s is a store to byte x, and -r returns a value written by s, then -s must precede w in the global memory order, -and there can be no store from a hart other than h to byte -x following s and preceding w -in the global memory order.

-
-
- - - - - -
- - -
-

The Atomicity Axiom theoretically supports LR/SC pairs of different widths and with -mismatched addresses, since implementations are permitted to allow SC -operations to succeed in such cases. However, in practice, we expect -such patterns to be rare, and their use is discouraged.

-
-
-
-
-
-
Progress Axiom
-
-

No memory operation may be preceded in the global memory order by an -infinite sequence of other memory operations.

-
-
-
-
-
-

18.2. CSR Dependency Tracking Granularity

- - ----- - - - - - - - - - - - - - - - - - - - - - - - - -
Table 10. Granularities at which syntactic dependencies are tracked through CSRs
Name | Portions Tracked as Independent Units | Aliases

fflags

Bits 4, 3, 2, 1, 0

fcsr

frm

entire CSR

fcsr

fcsr

Bits 7-5, 4, 3, 2, 1, 0

fflags, frm

-
-

Note: read-only CSRs are not listed, as they do not participate in the -definition of syntactic dependencies.

-
-
-
-

18.3. Source and Destination Register Listings

-
-

This section provides a concrete listing of the source and destination -registers for each instruction. These listings are used in the -definition of syntactic dependencies in -Section 18.1.2.

-
-
-

The term "accumulating CSR" is used to describe a CSR that is both a -source and a destination register, but which carries a dependency only -from itself to itself.

-
-
-

Instructions carry a dependency from each source register in the -"Source Registers" column to each destination register in the -"Destination Registers" column, from each source register in the -"Source Registers" column to each CSR in the "Accumulating CSRs" -column, and from each CSR in the "Accumulating CSRs" column to itself, -except where annotated otherwise.

-
-
-

Key:

-
-
-
    -
  • -

    A: Address source register

    -
  • -
  • -

    D: Data source register

    -
  • -
  • -

    † The instruction does not carry a dependency from -any source register to any destination register

    -
  • -
  • -

    ‡ The instruction carries dependencies from source -register(s) to destination register(s) as specified

    -
  • -
-
- - ------- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
Table 11. RV32I Base Integer Instruction Set
Source Registers | Destination Registers | Accumulating CSRs

LUI

rd

AUIPC

rd

JAL

rd

JALR†

rs1

rd

BEQ

rs1, rs2

BNE

rs1, rs2

BLT

rs1, rs2

BGE

rs1, rs2

BLTU

rs1, rs2

BGEU

rs1, rs2

LB †

rs1 A

rd

LH †

rs1 A

rd

LW †

rs1 A

rd

LBU †

rs1 A

rd

LHU †

rs1 A

rd

SB

rs1 A, rs2 D

SH

rs1 A, rs2 D

SW

rs1 A, rs2 D

ADDI

rs1

rd

SLTI

rs1

rd

SLTIU

rs1

rd

XORI

rs1

rd

ORI

rs1

rd

ANDI

rs1

rd

SLLI

rs1

rd

SRLI

rs1

rd

SRAI

rs1

rd

ADD

rs1, rs2

rd

SUB

rs1, rs2

rd

SLL

rs1, rs2

rd

SLT

rs1, rs2

rd

SLTU

rs1, rs2

rd

XOR

rs1, rs2

rd

SRL

rs1, rs2

rd

SRA

rs1, rs2

rd

OR

rs1, rs2

rd

AND

rs1, rs2

rd

FENCE

FENCE.I

ECALL

EBREAK

CSRRW‡

rs1, csr*

rd, csr

*unless rd=x0

CSRRS‡

rs1, csr

rd, csr*

*unless rs1=x0

CSRRC‡

rs1, csr

rd, csr*

*unless rs1=x0

‡ carries a dependency from rs1 to csr and from csr to rd

CSRRWI ‡

csr *

rd, csr

*unless rd=x0

CSRRSI ‡

csr

rd, csr*

*unless uimm[4:0]=0

CSRRCI ‡

csr

rd, csr*

*unless uimm[4:0]=0

‡ carries a dependency from csr to rd

- - ------- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
Table 12. RV64I Base Integer Instruction Set
Source Registers | Destination Registers | Accumulating CSRs

LWU

rs1 A

rd

LD

rs1 A

rd

SD

rs1 A, rs2 D

SLLI

rs1

rd

SRLI

rs1

rd

SRAI

rs1

rd

ADDIW

rs1

rd

SLLIW

rs1

rd

SRLIW

rs1

rd

SRAIW

rs1

rd

ADDW

rs1, rs2

rd

SUBW

rs1, rs2

rd

SLLW

rs1, rs2

rd

SRLW

rs1, rs2

rd

SRAW

rs1, rs2

rd

- - ------- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
Table 13. RV32M Standard Extension
Source Registers | Destination Registers | Accumulating CSRs

MUL

rs1, rs2

rd

MULH

rs1, rs2

rd

MULHSU

rs1, rs2

rd

MULHU

rs1, rs2

rd

DIV

rs1, rs2

rd

DIVU

rs1, rs2

rd

REM

rs1, rs2

rd

REMU

rs1, rs2

rd

- - ------- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
Table 14. RV64M Standard Extension
Source Registers | Destination Registers | Accumulating CSRs

MULW

rs1, rs2

rd

DIVW

rs1, rs2

rd

DIVUW

rs1, rs2

rd

REMW

rs1, rs2

rd

REMUW

rs1, rs2

rd

- - ------- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
Table 15. RV32A Standard Extension
Source Registers | Destination Registers | Accumulating CSRs

LR.W†

rs1 A

rd

SC.W†

rs1 A, rs2 D

rd *

* if successful

AMOSWAP.W†

rs1 A, rs2 D

rd

AMOADD.W†

rs1 A, rs2 D

rd

AMOXOR.W†

rs1 A, rs2 D

rd

AMOAND.W†

rs1 A, rs2 D

rd

AMOOR.W†

rs1 A, rs2 D

rd

AMOMIN.W†

rs1 A, rs2 D

rd

AMOMAX.W†

rs1 A, rs2 D

rd

AMOMINU.W†

rs1 A, rs2 D

rd

AMOMAXU.W†

rs1 A, rs2 D

rd

- - ------- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
Table 16. RV64A Standard Extension
Source Registers | Destination Registers | Accumulating CSRs

LR.D†

rs1 A

rd

SC.D†

rs1 A, rs2 D

rd *

*if successful

AMOSWAP.D†

rs1 A, rs2 D

rd

AMOADD.D†

rs1 A, rs2 D

rd

AMOXOR.D†

rs1 A, rs2 D

rd

AMOAND.D†

rs1 A, rs2 D

rd

AMOOR.D†

rs1 A, rs2 D

rd

AMOMIN.D†

rs1 A, rs2 D

rd

AMOMAX.D†

rs1 A, rs2 D

rd

AMOMINU.D†

rs1 A, rs2 D

rd

AMOMAXU.D†

rs1 A, rs2 D

rd

- - ------- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
Table 17. RV32F Standard Extension
Source Registers | Destination Registers | Accumulating CSRs

FLW†

rs1 A

rd

FSW

rs1 A, rs2 D

FMADD.S

rs1, rs2, rs3, frm*

rd

NV, OF, UF, NX

*if rm=111

FMSUB.S

rs1, rs2, rs3, frm*

rd

NV, OF, UF, NX

*if rm=111

FNMSUB.S

rs1, rs2, rs3, frm*

rd

NV, OF, UF, NX

*if rm=111

FNMADD.S

rs1, rs2, rs3, frm*

rd

NV, OF, UF, NX

*if rm=111

FADD.S

rs1, rs2, frm*

rd

NV, OF, NX

*if rm=111

FSUB.S

rs1, rs2, frm*

rd

NV, OF, NX

*if rm=111

FMUL.S

rs1, rs2, frm*

rd

NV, OF, UF, NX

*if rm=111

FDIV.S

rs1, rs2, frm*

rd

NV, DZ, OF, UF, NX

*if rm=111

FSQRT.S

rs1, frm*

rd

NV, NX

*if rm=111

FSGNJ.S

rs1, rs2

rd

FSGNJN.S

rs1, rs2

rd

FSGNJX.S

rs1, rs2

rd

FMIN.S

rs1, rs2

rd

NV

FMAX.S

rs1, rs2

rd

NV

FCVT.W.S

rs1, frm*

rd

NV, NX

*if rm=111

FCVT.WU.S

rs1, frm*

rd

NV, NX

*if rm=111

FMV.X.W

rs1

rd

FEQ.S

rs1, rs2

rd

NV

FLT.S

rs1, rs2

rd

NV

FLE.S

rs1, rs2

rd

NV

FCLASS.S

rs1

rd

FCVT.S.W

rs1, frm*

rd

NX

*if rm=111

FCVT.S.WU

rs1, frm*

rd

NX

*if rm=111

FMV.W.X

rs1

rd

- - ------- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
Table 18. RV64F Standard Extension
Source Registers | Destination Registers | Accumulating CSRs

FCVT.L.S

rs1, frm*

rd

NV, NX

*if rm=111

FCVT.LU.S

rs1, frm*

rd

NV, NX

*if rm=111

FCVT.S.L

rs1, frm*

rd

NX

*if rm=111

FCVT.S.LU

rs1, frm*

rd

NX

*if rm=111

- - ------- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
Table 19. RV32D Standard Extension
Source Registers | Destination Registers | Accumulating CSRs

FLD†

rs1 A

rd

FSD

rs1 A, rs2 D

FMADD.D

rs1, rs2, rs3, frm*

rd

NV, OF, UF, NX

*if rm=111

FMSUB.D

rs1, rs2, rs3, frm*

rd

NV, OF, UF, NX

*if rm=111

FNMSUB.D

rs1, rs2, rs3, frm*

rd

NV, OF, UF, NX

*if rm=111

FNMADD.D

rs1, rs2, rs3, frm*

rd

NV, OF, UF, NX

*if rm=111

FADD.D

rs1, rs2, frm*

rd

NV, OF, NX

*if rm=111

FSUB.D

rs1, rs2, frm*

rd

NV, OF, NX

*if rm=111

FMUL.D

rs1, rs2, frm*

rd

NV, OF, UF, NX

*if rm=111

FDIV.D

rs1, rs2, frm*

rd

NV, DZ, OF, UF, NX

*if rm=111

FSQRT.D

rs1, frm*

rd

NV, NX

*if rm=111

FSGNJ.D

rs1, rs2

rd

FSGNJN.D

rs1, rs2

rd

FSGNJX.D

rs1, rs2

rd

FMIN.D

rs1, rs2

rd

NV

FMAX.D

rs1, rs2

rd

NV

FCVT.S.D

rs1, frm*

rd

NV, OF, UF, NX

*if rm=111

FCVT.D.S

rs1

rd

NV

FEQ.D

rs1, rs2

rd

NV

FLT.D

rs1, rs2

rd

NV

FLE.D

rs1, rs2

rd

NV

FCLASS.D

rs1

rd

FCVT.W.D

rs1, frm*

rd

NV, NX

*if rm=111

FCVT.WU.D

rs1, frm*

rd

NV, NX

*if rm=111

FCVT.D.W

rs1

rd

FCVT.D.WU

rs1

rd

- - ------- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
Table 20. RV64D Standard Extension
Source Registers | Destination Registers | Accumulating CSRs

FCVT.L.D

rs1, frm*

rd

NV, NX

*if rm=111

FCVT.LU.D

rs1, frm*

rd

NV, NX

*if rm=111

FMV.X.D

rs1

rd

FCVT.D.L

rs1, frm*

rd

NX

*if rm=111

FCVT.D.LU

rs1, frm*

rd

NX

*if rm=111

FMV.D.X

rs1

rd

-
-
-
-
-

19. "Ztso" Extension for Total Store Ordering, Version 1.0

-
- -
-
-
-

20. "CMO" Extensions for Base Cache Management Operation ISA, Version 1.0.0

-
- -
-
-
-

21. "F" Extension for Single-Precision Floating-Point, Version 2.2

-
- -
-
-
-

22. "D" Extension for Double-Precision Floating-Point, Version 2.2

-
- -
-
-
-

23. "Q" Extension for Quad-Precision Floating-Point, Version 2.2

-
- -
-
-
-

24. "Zfh" and "Zfhmin" Extensions for Half-Precision Floating-Point, Version 1.0

-
- -
-
-
-

25. "BF16" Extensions for BFloat16-precision Floating-Point, Version 1.0

-
- -
-
-
-

26. "Zfa" Extension for Additional Floating-Point Instructions, Version 1.0

-
- -
-
-
-

27. "Zfinx", "Zdinx", "Zhinx", "Zhinxmin" Extensions for Floating-Point in Integer Registers, Version 1.0

-
- -
-
-
-

28. "C" Extension for Compressed Instructions, Version 2.0

-
-
-

This chapter describes the RISC-V standard compressed instruction-set -extension, named "C", which reduces static and dynamic code size by -adding short 16-bit instruction encodings for common operations. The C -extension can be added to any of the base ISAs (RV32, RV64, RV128), and -we use the generic term "RVC" to cover any of these. Typically, -50%-60% of the RISC-V instructions in a program can be replaced with RVC -instructions, resulting in a 25%-30% code-size reduction.

-
-
-

28.1. Overview

-
-

RVC uses a simple compression scheme that offers shorter 16-bit versions -of common 32-bit RISC-V instructions when:

-
-
-
    -
  • -

    the immediate or address offset is small, or

    -
  • -
  • -

    one of the registers is the zero register (x0), the ABI link register -(x1), or the ABI stack pointer (x2), or

    -
  • -
  • -

    the destination register and the first source register are identical, or

    -
  • -
  • -

    the registers used are the 8 most popular ones.

    -
  • -
-
-
-

The C extension is compatible with all other standard instruction -extensions. The C extension allows 16-bit instructions to be freely -intermixed with 32-bit instructions, with the latter now able to start -on any 16-bit boundary, i.e., IALIGN=16. With the addition of the C -extension, no instructions can raise instruction-address-misaligned -exceptions.

-
-
- - - - - -
- - -
-

Removing the 32-bit alignment constraint on the original 32-bit -instructions allows significantly greater code density.

-
-
-
-
-

The compressed instruction encodings are mostly common across RV32C, -RV64C, and RV128C, but as shown in Table 34, a few opcodes are used for -different purposes depending on base ISA. For example, the wider -address-space RV64C and RV128C variants require additional opcodes to -compress loads and stores of 64-bit integer values, while RV32C uses the -same opcodes to compress loads and stores of single-precision -floating-point values. Similarly, RV128C requires additional opcodes to -capture loads and stores of 128-bit integer values, while these same -opcodes are used for loads and stores of double-precision floating-point -values in RV32C and RV64C. If the C extension is implemented, the -appropriate compressed floating-point load and store instructions must -be provided whenever the relevant standard floating-point extension (F -and/or D) is also implemented. In addition, RV32C includes a compressed -jump and link instruction to compress short-range subroutine calls, -where the same opcode is used to compress ADDIW for RV64C and RV128C.

-
-
- - - - - -
- - -
-

Double-precision loads and stores are a significant fraction of static -and dynamic instructions, hence the motivation to include them in the -RV32C and RV64C encoding.

-
-
-

Although single-precision loads and stores are not a significant source -of static or dynamic compression for benchmarks compiled for the -currently supported ABIs, for microcontrollers that only provide -hardware single-precision floating-point units and have an ABI that only -supports single-precision floating-point numbers, the single-precision -loads and stores will be used at least as frequently as double-precision -loads and stores in the measured benchmarks. Hence, the motivation to -provide compressed support for these in RV32C.

-
-
-

Short-range subroutine calls are more likely in small binaries for -microcontrollers, hence the motivation to include these in RV32C.

-
-
-

Although reusing opcodes for different purposes for different base ISAs -adds some complexity to documentation, the impact on implementation -complexity is small even for designs that support multiple base ISAs. -The compressed floating-point load and store variants use the same -instruction format with the same register specifiers as the wider -integer loads and stores.

-
-
-
-
-

RVC was designed under the constraint that each RVC instruction expands -into a single 32-bit instruction in either the base ISA (RV32I/E, RV64I/E, -or RV128I) or the F and D standard extensions where present. Adopting -this constraint has two main benefits:

-
-
-
    -
  • -

    Hardware designs can simply expand RVC instructions during decode, -simplifying verification and minimizing modifications to existing -microarchitectures.

    -
  • -
  • -

    Compilers can be unaware of the RVC extension and leave code compression -to the assembler and linker, although a compression-aware compiler will -generally be able to produce better results.

    -
  • -
-
-
- - - - - -
- - -
-

We felt the multiple complexity reductions of a simple one-one mapping -between C and base IFD instructions far outweighed the potential gains -of a slightly denser encoding that added additional instructions only -supported in the C extension, or that allowed encoding of multiple IFD -instructions in one C instruction.

-
-
-
-
-

It is important to note that the C extension is not designed to be a -stand-alone ISA, and is meant to be used alongside a base ISA.

-
-
- - - - - -
- - -
-

Variable-length instruction sets have long been used to improve code -density. For example, the IBM Stretch (Buchholz, 1962), developed in the late 1950s, had -an ISA with 32-bit and 64-bit instructions, where some of the 32-bit -instructions were compressed versions of the full 64-bit instructions. -Stretch also employed the concept of limiting the set of registers that -were addressable in some of the shorter instruction formats, with short -branch instructions that could only refer to one of the index registers. -The later IBM 360 architecture (Amdahl et al., 1964) supported a simple variable-length -instruction encoding with 16-bit, 32-bit, or 48-bit instruction formats.

-
-
-

In 1963, CDC introduced the Cray-designed CDC 6600 (Thornton, 1965), a precursor to RISC -architectures, that introduced a register-rich load-store architecture -with instructions of two lengths, 15 bits and 30 bits. The later Cray-1 -design used a very similar instruction format, with 16-bit and 32-bit -instruction lengths.

-
-
-

The initial RISC ISAs from the 1980s all picked performance over code -size, which was reasonable for a workstation environment, but not for -embedded systems. Hence, both ARM and MIPS subsequently made versions of -the ISAs that offered smaller code size by offering an alternative -16-bit wide instruction set instead of the standard 32-bit wide -instructions. The compressed RISC ISAs reduced code size relative to -their starting points by about 25-30%, yielding code that was -significantly smaller than 80x86. This result surprised some, as their -intuition was that the variable-length CISC ISA should be smaller than -RISC ISAs that offered only 16-bit and 32-bit formats.

-
-
-

Since the original RISC ISAs did not leave sufficient opcode space free -to include these unplanned compressed instructions, they were instead -developed as complete new ISAs. This meant compilers needed different -code generators for the separate compressed ISAs. The first compressed -RISC ISA extensions (e.g., ARM Thumb and MIPS16) used only a fixed -16-bit instruction size, which gave good reductions in static code size -but caused an increase in dynamic instruction count, which led to lower -performance compared to the original fixed-width 32-bit instruction -size. This led to the development of a second generation of compressed -RISC ISA designs with mixed 16-bit and 32-bit instruction lengths (e.g., -ARM Thumb2, microMIPS, PowerPC VLE), so that performance was similar to -pure 32-bit instructions but with significant code size savings. -Unfortunately, these different generations of compressed ISAs are -incompatible with each other and with the original uncompressed ISA, -leading to significant complexity in documentation, implementations, and -software tools support.

-
-
-

Of the commonly used 64-bit ISAs, only PowerPC and microMIPS currently -support a compressed instruction format. It is surprising that the most -popular 64-bit ISA for mobile platforms (ARM v8) does not include a -compressed instruction format given that static code size and dynamic -instruction fetch bandwidth are important metrics. Although static code -size is not a major concern in larger systems, instruction fetch -bandwidth can be a major bottleneck in servers running commercial -workloads, which often have a large instruction working set.

-
-
-

Benefiting from 25 years of hindsight, RISC-V was designed to support -compressed instructions from the outset, leaving enough opcode space for -RVC to be added as a simple extension on top of the base ISA (along with -many other extensions). The philosophy of RVC is to reduce code size for -embedded applications and to improve performance and energy-efficiency -for all applications due to fewer misses in the instruction cache. -Waterman shows that RVC fetches 25%-30% fewer instruction bits, which -reduces instruction cache misses by 20%-25%, or roughly the same -performance impact as doubling the instruction cache size. (Waterman, 2011)

-
-
-
-
-
-

28.2. Compressed Instruction Formats

-
-

-
-
-

Table 21 shows the nine compressed instruction -formats. CR, CI, and CSS can use any of the 32 RVI registers, but CIW, -CL, CS, CA, and CB are limited to just 8 of them. -Table 22 lists these popular registers, which -correspond to registers x8 to x15. Note that there is a separate -version of load and store instructions that use the stack pointer as the -base address register, since saving to and restoring from the stack are -so prevalent, and that they use the CI and CSS formats to allow access -to all 32 data registers. CIW supplies an 8-bit immediate for the -ADDI4SPN instruction.

-
-
- - - - - -
- - -
-

The RISC-V ABI was changed to make the frequently used registers map to -registers 'x8-x15'. This simplifies the decompression decoder by -having a contiguous naturally aligned set of register numbers, and is -also compatible with the RV32E and RV64E base ISAs, which only have 16 integer -registers.

-
-
-
-
-

Compressed register-based floating-point loads and stores also use the -CL and CS formats respectively, with the eight registers mapping to f8 to f15. -

-
-
- - - - - -
- - -
-

The standard RISC-V calling convention maps the most frequently used -floating-point registers to registers f8 to f15, which allows the -same register decompression decoding as for integer register numbers.

-
-
-
-
-

-The formats were designed to keep bits for the two register source -specifiers in the same place in all instructions, while the destination -register field can move. When the full 5-bit destination register -specifier is present, it is in the same place as in the 32-bit RISC-V -encoding. Where immediates are sign-extended, the sign extension is -always from bit 12. Immediate fields have been scrambled, as in the base -specification, to reduce the number of immediate muxes required.

-
-
- - - - - -
- - -
-

The immediate fields are scrambled in the instruction formats instead of -in sequential order so that as many bits as possible are in the same -position in every instruction, thereby simplifying implementations.

-
-
-
-
-

For many RVC instructions, zero-valued immediates are disallowed and -x0 is not a valid 5-bit register specifier. These restrictions free up -encoding space for other instructions requiring fewer operand bits.

-
- - ---- - - - - - - -
Table 21. Compressed 16-bit RVC instruction formats
---- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -

Format

Meaning

CR

Register

CI

Immediate

CSS

Stack-relative Store

CIW

Wide Immediate

CL

Load

CS

Store

CA

Arithmetic

CB

Branch/Arithmetic

CJ

Jump

--------- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -

15 14 13 12

11 10 9 8 7

6 5 4 3 2

1 0

funct4

rd/rs1

rs2

op

funct3

imm

rd/rs1

imm

op

funct3

imm

rs2

op

funct3

imm

rd′

op

funct3

imm

rs1′

imm

rd′

op

funct3

imm

rs1′

imm

rs2′

op

funct6

rd′/rs1′

funct2

rs2′

op

funct3

offset

rd′/rs1′

offset

op

funct3

jump target

op

- - ---- - - - - - - -
Table 22. Registers specified by the three-bit rs1′, rs2′, and rd′ fields of the CIW, CL, CS, CA, and CB formats.
--- - - - - - - - - - - - - - - - - - -

RVC Register Number

Integer Register Number

Integer Register ABI Name

Floating-Point Register Number

Floating-Point Register ABI Name

---------- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -

000

001

010

011

100

101

110

111

x8

x9

x10

x11

x12

x13

x14

x15

s0

s1

a0

a1

a2

a3

a4

a5

f8

f9

f10

f11

f12

f13

f14

f15

fs0

fs1

fa0

fa1

fa2

fa3

fa4

fa5

-
-
-

28.3. Load and Store Instructions

-
-

To increase the reach of 16-bit instructions, data-transfer instructions -use zero-extended immediates that are scaled by the size of the data in -bytes: ×4 for words, ×8 for double -words, and ×16 for quad words.

-
-
-

RVC provides two variants of loads and stores. One uses the ABI stack -pointer, x2, as the base address and can target any data register. The -other can reference one of 8 base address registers and one of 8 data -registers.

-
-
-

28.3.1. Stack-Pointer-Based Loads and Stores

-
-
-Diagram -
-
-
-

These instructions use the CI format.

-
-
-

C.LWSP loads a 32-bit value from memory into register rd. It computes -an effective address by adding the zero-extended offset, scaled by 4, -to the stack pointer, x2. It expands to lw rd, offset(x2). C.LWSP is -only valid when rd≠x0; the code points with rd=x0 are reserved.

-
-
-

C.LDSP is an RV64C/RV128C-only instruction that loads a 64-bit value -from memory into register rd. It computes its effective address by -adding the zero-extended offset, scaled by 8, to the stack pointer, -x2. It expands to ld rd, offset(x2). C.LDSP is only valid when -rd≠x0; the code points with -rd=x0 are reserved.

-
-
-

C.LQSP is an RV128C-only instruction that loads a 128-bit value from -memory into register rd. It computes its effective address by adding -the zero-extended offset, scaled by 16, to the stack pointer, x2. It -expands to lq rd, offset(x2). C.LQSP is only valid when -rd≠x0; the code points with -rd=x0 are reserved.

-
-
-

C.FLWSP is an RV32FC-only instruction that loads a single-precision -floating-point value from memory into floating-point register rd. It -computes its effective address by adding the zero-extended offset, -scaled by 4, to the stack pointer, x2. It expands to -flw rd, offset(x2).

-
-
-

C.FLDSP is an RV32DC/RV64DC-only instruction that loads a -double-precision floating-point value from memory into floating-point -register rd. It computes its effective address by adding the -zero-extended offset, scaled by 8, to the stack pointer, x2. It -expands to fld rd, offset(x2).

-
-
-
-Diagram -
-
-
-

These instructions use the CSS format.

-
-
-

C.SWSP stores a 32-bit value in register rs2 to memory. It computes an -effective address by adding the zero-extended offset, scaled by 4, to -the stack pointer, x2. It expands to sw rs2, offset(x2).

-
-
-

C.SDSP is an RV64C/RV128C-only instruction that stores a 64-bit value in -register rs2 to memory. It computes an effective address by adding the -zero-extended offset, scaled by 8, to the stack pointer, x2. It -expands to sd rs2, offset(x2).

-
-
-

C.SQSP is an RV128C-only instruction that stores a 128-bit value in -register rs2 to memory. It computes an effective address by adding the -zero-extended offset, scaled by 16, to the stack pointer, x2. It -expands to sq rs2, offset(x2).

-
-
-

C.FSWSP is an RV32FC-only instruction that stores a single-precision -floating-point value in floating-point register rs2 to memory. It -computes an effective address by adding the zero-extended offset, -scaled by 4, to the stack pointer, x2. It expands to -fsw rs2, offset(x2).

-
-
-

C.FSDSP is an RV32DC/RV64DC-only instruction that stores a -double-precision floating-point value in floating-point register rs2 -to memory. It computes an effective address by adding the -zero-extended offset, scaled by 8, to the stack pointer, x2. It -expands to fsd rs2, offset(x2).

-
-
- - - - - -
- - -
-

Register save/restore code at function entry/exit represents a -significant portion of static code size. The stack-pointer-based -compressed loads and stores in RVC are effective at reducing the -save/restore static code size by a factor of 2 while improving -performance by reducing dynamic instruction bandwidth.

-
-
-

A common mechanism used in other ISAs to further reduce save/restore -code size is load-multiple and store-multiple instructions. We -considered adopting these for RISC-V but noted the following drawbacks -to these instructions:

-
-
-
    -
  • -

    These instructions complicate processor implementations.

    -
  • -
  • -

    For virtual memory systems, some data accesses could be resident in -physical memory and some could not, which requires a new restart -mechanism for partially executed instructions.

    -
  • -
  • -

    Unlike the rest of the RVC instructions, there is no IFD equivalent to -Load Multiple and Store Multiple.

    -
  • -
  • -

    Unlike the rest of the RVC instructions, the compiler would have to be -aware of these instructions to both generate the instructions and to -allocate registers in an order to maximize the chances of them being -saved and restored, since they would be saved and restored in sequential -order.

    -
  • -
  • -

    Simple microarchitectural implementations will constrain how other -instructions can be scheduled around the load and store multiple -instructions, leading to a potential performance loss.

    -
  • -
  • -

    The desire for sequential register allocation might conflict with the -featured registers selected for the CIW, CL, CS, CA, and CB formats.

    -
  • -
-
-
-

Furthermore, much of the gains can be realized in software by replacing -prologue and epilogue code with subroutine calls to common prologue and -epilogue code, a technique described in Section 5.6 of (Waterman, 2016).

-
-
-

While reasonable architects might come to different conclusions, we -decided to omit load and store multiple and instead use the -software-only approach of calling save/restore millicode routines to -attain the greatest code size reduction.

-
-
-
-
-
-

28.3.2. Register-Based Loads and Stores

-
-
-Diagram -
-
-
-

-These instructions use the CL format.

-
-
-

C.LW loads a 32-bit value from memory into register -rd′. It computes an effective address by adding the -zero-extended offset, scaled by 4, to the base address in register -rs1′. It expands to lw rd′, offset(rs1′).

-
-
-

C.LD is an RV64C/RV128C-only instruction that loads a 64-bit value from -memory into register rd′. It computes an effective -address by adding the zero-extended offset, scaled by 8, to the base -address in register rs1′. It expands to -ld rd′, offset(rs1′).

-
-
-

C.LQ is an RV128C-only instruction that loads a 128-bit value from -memory into register rd′. It computes an effective -address by adding the zero-extended offset, scaled by 16, to the base -address in register rs1′. It expands to -lq rd′, offset(rs1′).

-
-
-

C.FLW is an RV32FC-only instruction that loads a single-precision -floating-point value from memory into floating-point register -rd′. It computes an effective address by adding the -zero-extended offset, scaled by 4, to the base address in register -rs1′. It expands to -flw rd′, offset(rs1′).

-
-
-

C.FLD is an RV32DC/RV64DC-only instruction that loads a double-precision -floating-point value from memory into floating-point register -rd′. It computes an effective address by adding the -zero-extended offset, scaled by 8, to the base address in register -rs1′. It expands to -fld rd′, offset(rs1′).

-
-
-
-Diagram -
-
-
-

-
-
-

These instructions use the CS format.

-
-
-

C.SW stores a 32-bit value in register rs2′ to memory. -It computes an effective address by adding the zero-extended offset, -scaled by 4, to the base address in register rs1′. It -expands to sw rs2′, offset(rs1′).

-
-
-

C.SD is an RV64C/RV128C-only instruction that stores a 64-bit value in -register rs2′ to memory. It computes an effective -address by adding the zero-extended offset, scaled by 8, to the base -address in register rs1′. It expands to -sd rs2′, offset(rs1′).

-
-
-

C.SQ is an RV128C-only instruction that stores a 128-bit value in -register rs2′ to memory. It computes an effective -address by adding the zero-extended offset, scaled by 16, to the base -address in register rs1′. It expands to -sq rs2′, offset(rs1′).

-
-
-

C.FSW is an RV32FC-only instruction that stores a single-precision -floating-point value in floating-point register rs2′ to -memory. It computes an effective address by adding the zero-extended -offset, scaled by 4, to the base address in register -rs1′. It expands to -fsw rs2′, offset(rs1′).

-
-
-

C.FSD is an RV32DC/RV64DC-only instruction that stores a -double-precision floating-point value in floating-point register -rs2′ to memory. It computes an effective address by -adding the zero-extended offset, scaled by 8, to the base address in -register rs1′. It expands to -fsd rs2′, offset(rs1′).

-
-
-
-
-

28.4. Control Transfer Instructions

-
-

RVC provides unconditional jump instructions and conditional branch -instructions. As with base RVI instructions, the offsets of all RVC -control transfer instructions are in multiples of 2 bytes.

-
-
-
-Diagram -
-
-
-

-
-
-

These instructions use the CJ format.

-
-
-

C.J performs an unconditional control transfer. The offset is -sign-extended and added to the pc to form the jump target address. C.J -can therefore target a ±2 KiB range. C.J expands to -jal x0, offset.

-
-
-

C.JAL is an RV32C-only instruction that performs the same operation as -C.J, but additionally writes the address of the instruction following -the jump (pc+2) to the link register, x1. C.JAL expands to -jal x1, offset.

-
-
-
-Diagram -
-
-
-

-
-
-

These instructions use the CR format.

-
-
-

C.JR (jump register) performs an unconditional control transfer to the -address in register rs1. C.JR expands to jalr x0, 0(rs1). C.JR is -only valid when rs1≠x0; the code -point with rs1=x0 is reserved.

-
-
-

C.JALR (jump and link register) performs the same operation as C.JR, but -additionally writes the address of the instruction following the jump -(pc+2) to the link register, x1. C.JALR expands to -jalr x1, 0(rs1). C.JALR is only valid when -rs1≠x0; the code point with -rs1=x0 corresponds to the C.EBREAK -instruction.

-
-
- - - - - -
- - -
-

Strictly speaking, C.JALR does not expand exactly to a base RVI -instruction as the value added to the PC to form the link address is 2 -rather than 4 as in the base ISA, but supporting both offsets of 2 and 4 -bytes is only a very minor change to the base microarchitecture.

-
-
-
-
-
-Diagram -
-
-
-

-
-
-

These instructions use the CB format.

-
-
-

C.BEQZ performs conditional control transfers. The offset is -sign-extended and added to the pc to form the branch target address. -It can therefore target a ±256 B range. C.BEQZ takes the -branch if the value in register rs1′ is zero. It -expands to beq rs1′, x0, offset.

-
-
-

C.BNEZ is defined analogously, but it takes the branch if -rs1′ contains a nonzero value. It expands to -bne rs1′, x0, offset.

-
-
-
-

28.5. Integer Computational Instructions

-
-

RVC provides several instructions for integer arithmetic and constant -generation.

-
-
-

28.5.1. Integer Constant-Generation Instructions

-
-

The two constant-generation instructions both use the CI instruction -format and can target any integer register.

-
-
-
-Diagram -
-
-
-

-
-
-

C.LI loads the sign-extended 6-bit immediate, imm, into register rd. -C.LI expands into addi rd, x0, imm. C.LI is only valid when -rd≠x0; the code points with rd=x0 encode HINTs.

-
-
-

C.LUI loads the non-zero 6-bit immediate field into bits 17–12 of the -destination register, clears the bottom 12 bits, and sign-extends bit 17 -into all higher bits of the destination. C.LUI expands into -lui rd, imm. C.LUI is only valid when -rd≠{x0, x2}, -and when the immediate is not equal to zero. The code points with -imm=0 are reserved; the remaining code points with rd=x0 are -HINTs; and the remaining code points with rd=x2 correspond to the -C.ADDI16SP instruction.

-
-
-
-

28.5.2. Integer Register-Immediate Operations

-
-

These integer register-immediate operations are encoded in the CI format -and perform operations on an integer register and a 6-bit immediate.

-
-
-
-Diagram -
-
-
-

-
-
-

C.ADDI adds the non-zero sign-extended 6-bit immediate to the value in -register rd then writes the result to rd. C.ADDI expands into -addi rd, rd, imm. C.ADDI is only valid when -rd≠x0 and imm≠0. The code -points with rd=x0 encode the C.NOP instruction; the remaining code -points with imm=0 encode HINTs.

-
-
-

C.ADDIW is an RV64C/RV128C-only instruction that performs the same -computation but produces a 32-bit result, then sign-extends the result to 64 -bits. C.ADDIW expands into addiw rd, rd, imm. The immediate can be -zero for C.ADDIW, where this corresponds to sext.w rd. C.ADDIW is -only valid when rd≠x0; the code points with -rd=x0 are reserved.

-
-
-

C.ADDI16SP shares the opcode with C.LUI, but has a destination field of -x2. C.ADDI16SP adds the non-zero sign-extended 6-bit immediate to the -value in the stack pointer (sp=x2), where the immediate is scaled to -represent multiples of 16 in the range (-512,496). C.ADDI16SP is used to -adjust the stack pointer in procedure prologues and epilogues. It -expands into addi x2, x2, nzimm[9:4]. C.ADDI16SP is only valid when -nzimm≠0; the code point with nzimm=0 is reserved.

-
-
- - - - - -
- - -
-

In the standard RISC-V calling convention, the stack pointer sp is -always 16-byte aligned.

-
-
-
-
-
-Diagram -
-
-
-

-C.ADDI4SPN is a CIW-format instruction that adds a zero-extended -non-zero immediate, scaled by 4, to the stack pointer, x2, and writes -the result to rd′. This instruction is used to generate -pointers to stack-allocated variables, and expands to -addi rd′, x2, nzuimm[9:2]. C.ADDI4SPN is only valid when -nzuimm≠0; the code points with nzuimm=0 are -reserved.

-
-
-
-Diagram -
-
-
-

-
-
-

C.SLLI is a CI-format instruction that performs a logical left shift of -the value in register rd then writes the result to rd. The shift -amount is encoded in the shamt field. For RV128C, a shift amount of -zero is used to encode a shift of 64. C.SLLI expands into -slli rd, rd, shamt[5:0], except for RV128C with shamt=0, which expands to -slli rd, rd, 64.

-
-
-

For RV32C, shamt[5] must be zero; the code points with shamt[5]=1 -are designated for custom extensions. For RV32C and RV64C, the shift -amount must be non-zero; the code points with shamt=0 are HINTs. For -all base ISAs, the code points with rd=x0 are HINTs, except those -with shamt[5]=1 in RV32C.

-
-
-
-Diagram -
-
-
-

-
-
-

C.SRLI is a CB-format instruction that performs a logical right shift of -the value in register rd′ then writes the result to -rd′. The shift amount is encoded in the shamt field. -For RV128C, a shift amount of zero is used to encode a shift of 64. -Furthermore, the shift amount is sign-extended for RV128C, and so the -legal shift amounts are 1-31, 64, and 96-127. C.SRLI expands into -srli rd′, rd′, shamt, except for -RV128C with shamt=0, which expands to -srli rd′, rd′, 64.

-
-
-

For RV32C, shamt[5] must be zero; the code points with shamt[5]=1 -are designated for custom extensions. For RV32C and RV64C, the shift -amount must be non-zero; the code points with shamt=0 are HINTs.

-
-
-

C.SRAI is defined analogously to C.SRLI, but instead performs an -arithmetic right shift. C.SRAI expands to -srai rd′, rd′, shamt.

-
-
- - - - - -
- - -
-

Left shifts are usually more frequent than right shifts, as left shifts -are frequently used to scale address values. Right shifts have therefore -been granted less encoding space and are placed in an encoding quadrant -where all other immediates are sign-extended. For RV128, the decision -was made to have the 6-bit shift-amount immediate also be sign-extended. -Apart from reducing the decode complexity, we believe right-shift -amounts of 96-127 will be more useful than 64-95, to allow extraction of -tags located in the high portions of 128-bit address pointers. We note -that RV128C will not be frozen at the same point as RV32C and RV64C, to -allow evaluation of typical usage of 128-bit address-space codes.

-
-
-
-
-
-Diagram -
-
-
-

-
-
-

C.ANDI is a CB-format instruction that computes the bitwise AND of the -value in register rd′ and the sign-extended 6-bit -immediate, then writes the result to rd′. C.ANDI -expands to andi rd′, rd′, imm.

-
-
-
-

28.5.3. Integer Register-Register Operations

-
-
-Diagram -
-
-
-

-These instructions use the CR format.

-
-
-

C.MV copies the value in register rs2 into register rd. C.MV expands -into add rd, x0, rs2. C.MV is only valid when -rs2≠x0; the code points with rs2=x0 correspond to the C.JR instruction. The code points with rs2≠x0 and rd=x0 are HINTs.

-
-
- - - - - -
- - -
-

C.MV expands to a different instruction than the canonical MV -pseudoinstruction, which instead uses ADDI. Implementations that handle -MV specially, e.g. using register-renaming hardware, may find it more -convenient to expand C.MV to MV instead of ADD, at slight additional -hardware cost.

-
-
-
-
-

C.ADD adds the values in registers rd and rs2 and writes the result -to register rd. C.ADD expands into add rd, rd, rs2. C.ADD is only -valid when rs2≠x0; the code points with rs2=x0 correspond to the C.JALR -and C.EBREAK instructions. The code points with rs2≠x0 and rd=x0 are HINTs.

-
-
-
-Diagram -
-
-
-

-
-
-

These instructions use the CA format.

-
-
-

C.AND computes the bitwise AND of the values in registers -rd′ and rs2′, then writes the result -to register rd′. C.AND expands into -and rd′, rd′, rs2′.

-
-
-

C.OR computes the bitwise OR of the values in registers -rd′ and rs2′, then writes the result -to register rd′. C.OR expands into -or rd′, rd′, rs2′.

-
-
-

C.XOR computes the bitwise XOR of the values in registers -rd′ and rs2′, then writes the result -to register rd′. C.XOR expands into -xor rd′, rd′, rs2′.

-
-
-

C.SUB subtracts the value in register rs2′ from the -value in register rd′, then writes the result to -register rd′. C.SUB expands into -sub rd′, rd′, rs2′.

-
-
-

C.ADDW is an RV64C/RV128C-only instruction that adds the values in -registers rd′ and rs2′, then -sign-extends the lower 32 bits of the sum before writing the result to -register rd′. C.ADDW expands into -addw rd′, rd′, rs2′.

-
-
-

C.SUBW is an RV64C/RV128C-only instruction that subtracts the value in -register rs2′ from the value in register -rd′, then sign-extends the lower 32 bits of the -difference before writing the result to register rd′. -C.SUBW expands into subw rd′, rd′, rs2′.

-
-
- - - - - -
- - -
-

This group of six instructions do not provide large savings -individually, but do not occupy much encoding space and are -straightforward to implement, and as a group provide a worthwhile -improvement in static and dynamic compression.

-
-
-
-
-
-

28.5.4. Defined Illegal Instruction

-
-
-Diagram -
-
-
-

-
-
-

A 16-bit instruction with all bits zero is permanently reserved as an -illegal instruction.

-
-
- - - - - -
- - -
-

We reserve all-zero instructions to be illegal instructions to help trap -attempts to execute zero-ed or non-existent portions of the memory -space. The all-zero value should not be redefined in any non-standard -extension. Similarly, we reserve instructions with all bits set to 1 -(corresponding to very long instructions in the RISC-V variable-length -encoding scheme) as illegal to capture another common value seen in -non-existent memory regions.

-
-
-
-
-
-

28.5.5. NOP Instruction

-
-
-Diagram -
-
-
-

-
-
-

C.NOP is a CI-format instruction that does not change any user-visible -state, except for advancing the pc and incrementing any applicable -performance counters. C.NOP expands to nop. C.NOP is only valid when -imm=0; the code points with imm≠0 encode HINTs.

-
-
-
-

28.5.6. Breakpoint Instruction

-
-
-Diagram -
-
-
-

-
-
-

Debuggers can use the C.EBREAK instruction, which expands to ebreak, -to cause control to be transferred back to the debugging environment. -C.EBREAK shares the opcode with the C.ADD instruction, but with rd and -rs2 both zero, thus can also use the CR format.

-
-
-
-
-

28.6. Usage of C Instructions in LR/SC Sequences

-
-

On implementations that support the C extension, compressed forms of the -I instructions permitted inside constrained LR/SC sequences, as -described in [sec:lrscseq], are also permitted -inside constrained LR/SC sequences.

-
-
- - - - - -
- - -
-

The implication is that any implementation that claims to support both -the A and C extensions must ensure that LR/SC sequences containing valid -C instructions will eventually complete.

-
-
-
-
-
-

28.7. HINT Instructions

-
-

A portion of the RVC encoding space is reserved for microarchitectural -HINTs. Like the HINTs in the RV32I base ISA (see -HINT Instructions), these instructions do not -modify any architectural state, except for advancing the pc and any -applicable performance counters. HINTs are executed as no-ops on -implementations that ignore them.

-
-
-

RVC HINTs are encoded as computational instructions that do not modify -the architectural state, either because rd=x0 (e.g. -C.ADD x0, t0), or because rd is overwritten with a copy of itself -(e.g. C.ADDI t0, 0).

-
-
- - - - - -
- - -
-

This HINT encoding has been chosen so that simple implementations can -ignore HINTs altogether, and instead execute a HINT as a regular -computational instruction that happens not to mutate the architectural -state.

-
-
-
-
-

RVC HINTs do not necessarily expand to their RVI HINT counterparts. For -example, C.ADD x0, a0 might not encode the same HINT as -ADD x0, x0, a0.

-
-
- - - - - -
- - -
-

The primary reason to not require an RVC HINT to expand to an RVI HINT -is that HINTs are unlikely to be compressible in the same manner as the -underlying computational instruction. Also, decoupling the RVC and RVI -HINT mappings allows the scarce RVC HINT space to be allocated to the -most popular HINTs, and in particular, to HINTs that are amenable to -macro-op fusion.

-
-
-
-
-

Table 23 lists all RVC HINT code points. For RV32C, 78% -of the HINT space is reserved for standard HINTs. The remainder of the HINT space is designated for custom HINTs; -no standard HINTs will ever be defined in this subspace.

-
- - ------ - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
Table 23. RVC HINT instructions.
InstructionConstraintsCode PointsPurpose

C.NOP

imm≠0

63

Designated for future standard use

C.ADDI

rd≠x0, imm=0

31

C.LI

rd=x0

64

C.LUI

rd=x0, imm≠0

63

C.MV

rd=x0, rs2≠x0

31

C.ADD

rd=x0, rs2≠x0, rs2≠x2-x5

27

C.ADD

rd=x0, rs2=x2-x5

4

(rs2=x2) C.NTL.P1 (rs2=x3) C.NTL.PALL (rs2=x4) C.NTL.S1 (rs2=x5) C.NTL.ALL

C.SLLI

rd=x0, imm≠0

31 (RV32), 63 (RV64/128)

Designated for custom use

C.SLLI64

rd=x0

1

C.SLLI64

rd≠x0, RV32 and RV64 only

31

C.SRLI64

RV32 and RV64 only

8

C.SRAI64

RV32 and RV64 only

8

-
-
-

28.8. RVC Instruction Set Listings

-
-

Table 24 shows a map of the major -opcodes for RVC. Each row of the table corresponds to one quadrant of -the encoding space. The last quadrant, which has the two -least-significant bits set, corresponds to instructions wider than 16 -bits, including those in the base ISAs. Several instructions are only -valid for certain operands; when invalid, they are marked either RES -to indicate that the opcode is reserved for future standard extensions; -Custom to indicate that the opcode is designated for custom -extensions; or HINT to indicate that the opcode is reserved for -microarchitectural hints (see Section 28.7).

-
-
- - ------------- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
Table 24. RVC opcode map instructions.

inst[15:13]
-inst[1:0]

000

001

010

011

100

101

110

111

00

ADDI4SPN

FLD
-FLD
-LQ

LW

FLW
-LD
-LD

Reserved

FSD
-FSD
-SQ

SW

FSW
-SD
-SD

RV32
-RV64
-RV128

01

ADDI

JAL
-ADDIW
-ADDIW

LI

LUI/ADDI16SP

MISC-ALU

J

BEQZ

BNEZ

RV32
-RV64
-RV128

10

SLLI

FLDSP
-FLDSP
-LQSP

LWSP

FLWSP
-LDSP
-LDSP

J[AL]R/MV/ADD

FSDSP
-FSDSP
-SQSP

SWSP

FSWSP
-SDSP
-SDSP

RV32
-RV64
-RV128

11

>16b

-
-

Figure 2, Figure 3, and Figure 4 list the RVC instructions.

-
-
-
-Diagram -
-
Figure 2. Instruction listing for RVC, Quadrant 0
-
-
-
-Diagram -
-
Figure 3. Instruction listing for RVC, Quadrant 1
-
-
-
-Diagram -
-
Figure 4. Instruction listing for RVC, Quadrant 2
-
-
-
-
-
-

29. "Zc*" Extension for Code Size Reduction, Version 1.0.0

-
-
-

29.1. Zc* Overview

-
-

Zc* is a group of extensions that define subsets of the existing C extension (Zca, Zcd, Zcf) and new extensions which only contain 16-bit encodings.

-
-
-

Zcm* all reuse the encodings for c.fld, c.fsd, c.fldsp, c.fsdsp.

-
- - --------- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
Table 25. Zc* extension overview
InstructionZcaZcfZcdZcbZcmpZcmt

The Zca extension is added as a way to refer to instructions in the C extension that do not include the floating-point loads and stores

C excl. c.f*

yes

The Zcf extension is added as a way to refer to compressed single-precision floating-point load/stores

c.flw

rv32

c.flwsp

rv32

c.fsw

rv32

c.fswsp

rv32

The Zcd extension is added as a way to refer to compressed double-precision floating-point load/stores

c.fld

yes

c.fldsp

yes

c.fsd

yes

c.fsdsp

yes

Simple operations for use on all architectures

c.lbu

yes

c.lh

yes

c.lhu

yes

c.sb

yes

c.sh

yes

c.zext.b

yes

c.sext.b

yes

c.zext.h

yes

c.sext.h

yes

c.zext.w

yes

c.mul

yes

c.not

yes

PUSH/POP and double move which overlap with c.fsdsp. Complex operations intended for embedded CPUs

cm.push

yes

cm.pop

yes

cm.popret

yes

cm.popretz

yes

cm.mva01s

yes

cm.mvsa01

yes

Table jump which overlaps with c.fsdsp. Complex operations intended for embedded CPUs

cm.jt

yes

cm.jalt

yes

-
-
-

29.2. C

-
-

The C extension is the superset of the following extensions:

-
-
-
    -
  • -

    Zca

    -
  • -
  • -

    Zcf if F is specified (RV32 only)

    -
  • -
  • -

    Zcd if D is specified

    -
  • -
-
-
-

As C defines the same instructions as Zca, Zcf and Zcd, the rule is that:

-
-
-
    -
  • -

    C always implies Zca

    -
  • -
  • -

    C+F implies Zcf (RV32 only)

    -
  • -
  • -

    C+D implies Zcd

    -
  • -
-
-
-
-

29.3. Zce

-
-

The Zce extension is intended to be used for microcontrollers, and includes all relevant Zc extensions.

-
-
-
    -
  • -

    Specifying Zce on RV32 without F includes Zca, Zcb, Zcmp, Zcmt

    -
  • -
  • -

    Specifying Zce on RV32 with F includes Zca, Zcb, Zcmp, Zcmt and Zcf

    -
  • -
  • -

    Specifying Zce on RV64 always includes Zca, Zcb, Zcmp, Zcmt

    -
    -
      -
    • -

      Zcf doesn’t exist for RV64

      -
    • -
    -
    -
  • -
-
-
-

Therefore common ISA strings can be updated as follows to include the relevant Zc extensions, for example:

-
-
-
    -
  • -

    RV32IMC becomes RV32IM_Zce

    -
  • -
  • -

    RV32IMCF becomes RV32IMF_Zce

    -
  • -
-
-
-
-

29.4. MISA.C

-
-

MISA.C is set if the following extensions are selected:

-
-
-
    -
  • -

    Zca and not F

    -
  • -
  • -

    Zca, Zcf and F is specified (RV32 only)

    -
  • -
  • -

    Zca, Zcf and Zcd if D is specified (RV32 only)

    -
    -
      -
    • -

      this configuration excludes Zcmp, Zcmt

      -
    • -
    -
    -
  • -
  • -

    Zca, Zcd if D is specified (RV64 only)

    -
    -
      -
    • -

      this configuration excludes Zcmp, Zcmt

      -
    • -
    -
    -
  • -
-
-
-
-

29.5. Zca

-
-

The Zca extension is added as a way to refer to instructions in the C extension that do not include the floating-point loads and stores.

-
-
-

Therefore it excludes all 16-bit floating point loads and stores: c.flw, c.flwsp, c.fsw, c.fswsp, c.fld, c.fldsp, c.fsd, c.fsdsp.

-
-
- - - - - -
- - -
-

the C extension only includes F/D instructions when D and F are also specified

-
-
-
-
-
-

29.6. Zcf (RV32 only)

-
-

Zcf is the existing set of compressed single precision floating point loads and stores: c.flw, c.flwsp, c.fsw, c.fswsp.

-
-
-

Zcf is only relevant to RV32, it cannot be specified for RV64.

-
-
-

The Zcf extension depends on the Zca and F extensions.

-
-
-
-

29.7. Zcd

-
-

Zcd is the existing set of compressed double precision floating point loads and stores: c.fld, c.fldsp, c.fsd, c.fsdsp.

-
-
-

The Zcd extension depends on the Zca and D extensions.

-
-
-
-

29.8. Zcb

-
-

Zcb has simple code-size saving instructions which are easy to implement on all CPUs.

-
-
-

All encodings are currently reserved for all architectures, and have no conflicts with any existing extensions.

-
-
- - - - - -
- - -Zcb can be implemented on any CPU as the instructions are 16-bit versions of existing 32-bit instructions from the application class profile. -
-
-
-

The Zcb extension depends on the Zca extension.

-
-
-

As shown on the individual instruction pages, many of the instructions in Zcb depend upon another extension being implemented. For example, c.mul is only implemented if M or Zmmul is implemented, and c.sext.b is only implemented if Zbb is implemented.

-
-
-

The c.mul encoding uses the CA register format along with other instructions such as c.sub, c.xor etc.

-
-
- - - - - -
- - - c.sext.w is a pseudoinstruction for c.addiw rd, 0 (RV64) -
-
- ------ - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
RV32 | RV64 | Mnemonic | Instruction

yes

yes

c.lbu rd', uimm(rs1')

Load unsigned byte, 16-bit encoding

yes

yes

c.lhu rd', uimm(rs1')

Load unsigned halfword, 16-bit encoding

yes

yes

c.lh rd', uimm(rs1')

Load signed halfword, 16-bit encoding

yes

yes

c.sb rs2', uimm(rs1')

Store byte, 16-bit encoding

yes

yes

c.sh rs2', uimm(rs1')

Store halfword, 16-bit encoding

yes

yes

c.zext.b rsd'

Zero extend byte, 16-bit encoding

yes

yes

c.sext.b rsd'

Sign extend byte, 16-bit encoding

yes

yes

c.zext.h rsd'

Zero extend halfword, 16-bit encoding

yes

yes

c.sext.h rsd'

Sign extend halfword, 16-bit encoding

yes

c.zext.w rsd'

Zero extend word, 16-bit encoding

yes

yes

c.not rsd'

Bitwise not, 16-bit encoding

yes

yes

c.mul rsd', rs2'

Multiply, 16-bit encoding

-
-
-
-

29.9. Zcmp

-
-

The Zcmp extension is a set of instructions which may be executed as a series of existing 32-bit RISC-V instructions.

-
-
-

This extension reuses some encodings from c.fsdsp. Therefore it is incompatible with Zcd, - which is included when C and D extensions are both present.

-
-
- - - - - -
- - -Zcmp is primarily targeted at embedded class CPUs due to implementation complexity. Additionally, it is not compatible with architecture class profiles. -
-
-
-

The Zcmp extension depends on the Zca extension.

-
-
-

The PUSH/POP assembly syntax uses several variables, the meanings of which are:

-
-
-
    -
  • -

    reg_list is a list containing 1 to 13 registers (ra and 0 to 12 s registers)

    -
    -
      -
    • -

      valid values: {ra}, {ra, s0}, {ra, s0-s1}, {ra, s0-s2}, …​, {ra, s0-s8}, {ra, s0-s9}, {ra, s0-s11}

      -
    • -
    • -

      note that {ra, s0-s10} is not valid, giving 12 lists not 13 for better encoding

      -
    • -
    -
    -
  • -
  • -

    stack_adj is the total size of the stack frame.

    -
    -
      -
    • -

      valid values vary with register list length and the specific encoding, see the instruction pages for details.

      -
    • -
    -
    -
  • -
-
- ------ - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
RV32 | RV64 | Mnemonic | Instruction

yes

yes

cm.push {reg_list}, -stack_adj

cm.push

yes

yes

cm.pop {reg_list}, stack_adj

cm.pop

yes

yes

cm.popret {reg_list}, stack_adj

cm.popret

yes

yes

cm.popretz {reg_list}, stack_adj

cm.popretz

yes

yes

cm.mva01s rs1', rs2'

Move two s0-s7 registers into a0-a1

yes

yes

cm.mvsa01 r1s', r2s'

Move a0-a1 into two different s0-s7 registers

-
-
-
-

29.10. Zcmt

-
-

Zcmt adds the table jump instructions and also adds the jvt CSR. The jvt CSR requires a -state enable if Smstateen is implemented. See jvt CSR, table jump base vector and control register for details.

-
-
-

This extension reuses some encodings from c.fsdsp. Therefore it is incompatible with Zcd, - which is included when C and D extensions are both present.

-
-
- - - - - -
- - -Zcmt is primarily targeted at embedded class CPUs due to implementation complexity. Additionally, it is not compatible with RVA profiles. -
-
-
-

The Zcmt extension depends on the Zca and Zicsr extensions.

-
- ------ - - - - - - - - - - - - - - - - - - - - - - -
RV32 | RV64 | Mnemonic | Instruction

yes

yes

cm.jt index

Jump via table

yes

yes

cm.jalt index

Jump and link via table

-
-
-

29.11. Zc instruction formats

-
-

Several instructions in this specification use the following new instruction formats.

-
- --------------- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
Format | instructions | 15:10 | 9 | 8 | 7 | 6 | 5 | 4 | 3 | 2 | 1 | 0

CLB

c.lbu

funct6

rs1'

uimm

rd'

op

CSB

c.sb

funct6

rs1'

uimm

rs2'

op

CLH

c.lhu, c.lh

funct6

rs1'

funct1

uimm

rd'

op

CSH

c.sh

funct6

rs1'

funct1

uimm

rs2'

op

CU

c.[sz]ext.*, c.not

funct6

rd'/rs1'

funct5

op

CMMV

cm.mvsa01 cm.mva01s

funct6

r1s'

funct2

r2s'

op

CMJT

cm.jt cm.jalt

funct6

index

op

CMPP

cm.push*, cm.pop*

funct6

funct2

urlist

spimm

op

-
- - - - - -
- - -
-

c.mul uses the existing CA format.

-
-
-
-
-
-
-

29.12. Zcb instructions

-
-

29.12.1. c.lbu

-
-

Synopsis:

-
-
-

Load unsigned byte, 16-bit encoding

-
-
-

Mnemonic:

-
-
-

c.lbu rd', uimm(rs1')

-
-
-

Encoding (RV32, RV64):

-
-
-
-Diagram -
-
-
-

The immediate offset is formed as follows:

-
-
-
-
  uimm[31:2] = 0;
-  uimm[1]    = encoding[5];
-  uimm[0]    = encoding[6];
-
-
-
-

Description:

-
-
-

This instruction loads a byte from the memory address formed by adding rs1' to the zero extended immediate uimm. The resulting byte is zero extended to XLEN bits and is written to rd'.

-
-
- - - - - -
- - -
-

rd' and rs1' are from the standard 8-register set x8-x15.

-
-
-
-
-

Prerequisites:

-
-
-

None

-
-
-

Operation:

-
-
-
-
//This is not SAIL, it's pseudocode. The SAIL hasn't been written yet.
-
-X(rdc) = EXTZ(mem[X(rs1c)+EXTZ(uimm)][7..0]);
-
-
-
-
-
-

29.12.2. c.lhu

-
-

Synopsis:

-
-
-

Load unsigned halfword, 16-bit encoding

-
-
-

Mnemonic:

-
-
-

c.lhu rd', uimm(rs1')

-
-
-

Encoding (RV32, RV64):

-
-
-
-Diagram -
-
-
-

The immediate offset is formed as follows:

-
-
-
-
  uimm[31:2] = 0;
-  uimm[1]    = encoding[5];
-  uimm[0]    = 0;
-
-
-
-

Description:

-
-
-

This instruction loads a halfword from the memory address formed by adding rs1' to the zero extended immediate uimm. The resulting halfword is zero extended to XLEN bits and is written to rd'.

-
-
- - - - - -
- - -
-

rd' and rs1' are from the standard 8-register set x8-x15.

-
-
-
-
-

Prerequisites:

-
-
-

None

-
-
-

Operation:

-
-
-
-
//This is not SAIL, it's pseudocode. The SAIL hasn't been written yet.
-
-X(rdc) = EXTZ(load_mem[X(rs1c)+EXTZ(uimm)][15..0]);
-
-
-
-
-
-

29.12.3. c.lh

-
-

Synopsis:

-
-
-

Load signed halfword, 16-bit encoding

-
-
-

Mnemonic:

-
-
-

c.lh rd', uimm(rs1')

-
-
-

Encoding (RV32, RV64):

-
-
-
-Diagram -
-
-
-

The immediate offset is formed as follows:

-
-
-
-
  uimm[31:2] = 0;
-  uimm[1]    = encoding[5];
-  uimm[0]    = 0;
-
-
-
-

Description:

-
-
-

This instruction loads a halfword from the memory address formed by adding rs1' to the zero extended immediate uimm. The resulting halfword is sign extended to XLEN bits and is written to rd'.

-
-
- - - - - -
- - -
-

rd' and rs1' are from the standard 8-register set x8-x15.

-
-
-
-
-

Prerequisites:

-
-
-

None

-
-
-

Operation:

-
-
-
-
//This is not SAIL, it's pseudocode. The SAIL hasn't been written yet.
-
-X(rdc) = EXTS(load_mem[X(rs1c)+EXTZ(uimm)][15..0]);
-
-
-
-
-
-

29.12.4. c.sb

-
-

Synopsis:

-
-
-

Store byte, 16-bit encoding

-
-
-

Mnemonic:

-
-
-

c.sb rs2', uimm(rs1')

-
-
-

Encoding (RV32, RV64):

-
-
-
-Diagram -
-
-
-

The immediate offset is formed as follows:

-
-
-
-
  uimm[31:2] = 0;
-  uimm[1]    = encoding[5];
-  uimm[0]    = encoding[6];
-
-
-
-

Description:

-
-
-

This instruction stores the least significant byte of rs2' to the memory address formed by adding rs1' to the zero extended immediate uimm.

-
-
- - - - - -
- - -
-

rs1' and rs2' are from the standard 8-register set x8-x15.

-
-
-
-
-

Prerequisites:

-
-
-

None

-
-
-

Operation:

-
-
-
-
//This is not SAIL, it's pseudocode. The SAIL hasn't been written yet.
-
-mem[X(rs1c)+EXTZ(uimm)][7..0] = X(rs2c)
-
-
-
-
-
-

29.12.5. c.sh

-
-

Synopsis:

-
-
-

Store halfword, 16-bit encoding

-
-
-

Mnemonic:

-
-
-

c.sh rs2', uimm(rs1')

-
-
-

Encoding (RV32, RV64):

-
-
-
-Diagram -
-
-
-

The immediate offset is formed as follows:

-
-
-
-
  uimm[31:2] = 0;
-  uimm[1]    = encoding[5];
-  uimm[0]    = 0;
-
-
-
-

Description:

-
-
-

This instruction stores the least significant halfword of rs2' to the memory address formed by adding rs1' to the zero extended immediate uimm.

-
-
- - - - - -
- - -
-

rs1' and rs2' are from the standard 8-register set x8-x15.

-
-
-
-
-

Prerequisites:

-
-
-

None

-
-
-

Operation:

-
-
-
-
//This is not SAIL, it's pseudocode. The SAIL hasn't been written yet.
-
-mem[X(rs1c)+EXTZ(uimm)][15..0] = X(rs2c)
-
-
-
-
-
-

29.12.6. c.zext.b

-
-

Synopsis:

-
-
-

Zero extend byte, 16-bit encoding

-
-
-

Mnemonic:

-
-
-

c.zext.b rd'/rs1'

-
-
-

Encoding (RV32, RV64):

-
-
-
-Diagram -
-
-
-

Description:

-
-
-

This instruction takes a single source/destination operand. -It zero-extends the least-significant byte of the operand to XLEN bits by inserting zeros into all of -the bits more significant than 7.

-
-
- - - - - -
- - -
-

rd'/rs1' is from the standard 8-register set x8-x15.

-
-
-
-
-

Prerequisites:

-
-
-

None

-
-
-

32-bit equivalent:

-
-
-
-
andi rd'/rs1', rd'/rs1', 0xff
-
-
-
- - - - - -
- - -
-

The SAIL module variable for rd'/rs1' is called rsdc.

-
-
-
-
-

Operation:

-
-
-
-
X(rsdc) = EXTZ(X(rsdc)[7..0]);
-
-
-
-
-
-

29.12.7. c.sext.b

-
-

Synopsis:

-
-
-

Sign extend byte, 16-bit encoding

-
-
-

Mnemonic:

-
-
-

c.sext.b rd'/rs1'

-
-
-

Encoding (RV32, RV64):

-
-
-
-Diagram -
-
-
-

Description:

-
-
-

This instruction takes a single source/destination operand. -It sign-extends the least-significant byte in the operand to XLEN bits by copying the most-significant bit -in the byte (i.e., bit 7) to all of the more-significant bits.

-
-
- - - - - -
- - -
-

rd'/rs1' is from the standard 8-register set x8-x15.

-
-
-
-
-

Prerequisites:

-
-
-

Zbb is also required.

-
-
- - - - - -
- - -The SAIL module variable for rd'/rs1' is called rsdc. -
-
-
-

Operation:

-
-
-
-
X(rsdc) = EXTS(X(rsdc)[7..0]);
-
-
-
-
-
-

29.12.8. c.zext.h

-
-

Synopsis:

-
-
-

Zero extend halfword, 16-bit encoding

-
-
-

Mnemonic:

-
-
-

c.zext.h rd'/rs1'

-
-
-

Encoding (RV32, RV64):

-
-
-
-Diagram -
-
-
-

Description:

-
-
-

This instruction takes a single source/destination operand. -It zero-extends the least-significant halfword of the operand to XLEN bits by inserting zeros into all of -the bits more significant than 15.

-
-
- - - - - -
- - -
-

rd'/rs1' is from the standard 8-register set x8-x15.

-
-
-
-
-

Prerequisites:

-
-
-

Zbb is also required.

-
-
- - - - - -
- - -
-

The SAIL module variable for rd'/rs1' is called rsdc.

-
-
-
-
-

Operation:

-
-
-
-
X(rsdc) = EXTZ(X(rsdc)[15..0]);
-
-
-
-
-
-

29.12.9. c.sext.h

-
-

Synopsis:

-
-
-

Sign extend halfword, 16-bit encoding

-
-
-

Mnemonic:

-
-
-

c.sext.h rd'/rs1'

-
-
-

Encoding (RV32, RV64):

-
-
-
-Diagram -
-
-
-

Description:

-
-
-

This instruction takes a single source/destination operand. -It sign-extends the least-significant halfword in the operand to XLEN bits by copying the most-significant bit -in the halfword (i.e., bit 15) to all of the more-significant bits.

-
-
- - - - - -
- - -
-

rd'/rs1' is from the standard 8-register set x8-x15.

-
-
-
-
-

Prerequisites:

-
-
-

Zbb is also required.

-
-
- - - - - -
- - -
-

The SAIL module variable for rd'/rs1' is called rsdc.

-
-
-
-
-

Operation:

-
-
-
-
X(rsdc) = EXTS(X(rsdc)[15..0]);
-
-
-
-
-
-

29.12.10. c.zext.w

-
-

Synopsis:

-
-
-

Zero extend word, 16-bit encoding

-
-
-

Mnemonic:

-
-
-

c.zext.w rd'/rs1'

-
-
-

Encoding (RV64):

-
-
-
-Diagram -
-
-
-

Description:

-
-
-

This instruction takes a single source/destination operand. -It zero-extends the least-significant word of the operand to XLEN bits by inserting zeros into all of -the bits more significant than 31.

-
-
- - - - - -
- - -
-

rd'/rs1' is from the standard 8-register set x8-x15.

-
-
-
-
-

Prerequisites:

-
-
-

Zba is also required.

-
-
-

32-bit equivalent:

-
-
-
-
add.uw rd'/rs1', rd'/rs1', zero
-
-
-
- - - - - -
- - -
-

The SAIL module variable for rd'/rs1' is called rsdc.

-
-
-
-
-

Operation:

-
-
-
-
X(rsdc) = EXTZ(X(rsdc)[31..0]);
-
-
-
-
-
-

29.12.11. c.not

-
-

Synopsis:

-
-
-

Bitwise not, 16-bit encoding

-
-
-

Mnemonic:

-
-
-

c.not rd'/rs1'

-
-
-

Encoding (RV32, RV64):

-
-
-
-Diagram -
-
-
-

Description:

-
-
-

This instruction takes the one’s complement of rd'/rs1' and writes the result to the same register.

-
-
- - - - - -
- - -
-

rd'/rs1' is from the standard 8-register set x8-x15.

-
-
-
-
-

Prerequisites:

-
-
-

None

-
-
-

32-bit equivalent:

-
-
-
-
xori rd'/rs1', rd'/rs1', -1
-
-
-
- - - - - -
- - -
-

The SAIL module variable for rd'/rs1' is called rsdc.

-
-
-
-
-

Operation:

-
-
-
-
X(rsdc) = X(rsdc) XOR -1;
-
-
-
-
-
-

29.12.12. c.mul

-
-

Synopsis:

-
-
-

Multiply, 16-bit encoding

-
-
-

Mnemonic:

-
-
-

c.mul rsd', rs2'

-
-
-

Encoding (RV32, RV64):

-
-
-
-Diagram -
-
-
-

Description:

-
-
-

This instruction multiplies XLEN bits of the source operands from rsd' and rs2' and writes the lowest XLEN bits of the result to rsd'.

-
-
- - - - - -
- - -
-

rd'/rs1' and rs2' are from the standard 8-register set x8-x15.

-
-
-
-
-

Prerequisites:

-
-
-

M or Zmmul must be configured.

-
-
- - - - - -
- - -
-

The SAIL module variable for rd'/rs1' is called rsdc, and for rs2' is called rs2c.

-
-
-
-
-

Operation:

-
-
-
-
let result_wide = to_bits(2 * sizeof(xlen), signed(X(rsdc)) * signed(X(rs2c)));
-X(rsdc) = result_wide[(sizeof(xlen) - 1) .. 0];
-
-
-
-
-
-
-

29.13. PUSH/POP register instructions

-
-

These instructions are collectively referred to as PUSH/POP:

-
-
- -
-
-

The term PUSH refers to cm.push.

-
-
-

The term POP refers to cm.pop.

-
-
-

The term POPRET refers to cm.popret and cm.popretz.

-
-
-

Common details for these instructions are in this section.

-
-
-

29.13.1. PUSH/POP functional overview

-
-

PUSH, POP, POPRET are used to reduce the size of function prologues and epilogues.

-
-
-
    -
  1. -

    The PUSH instruction

    -
    -
      -
    • -

      adjusts the stack pointer to create the stack frame

      -
    • -
    • -

      pushes (stores) the registers specified in the register list to the stack frame

      -
    • -
    -
    -
  2. -
  3. -

    The POP instruction

    -
    -
      -
    • -

      pops (loads) the registers in the register list from the stack frame

      -
    • -
    • -

      adjusts the stack pointer to destroy the stack frame

      -
    • -
    -
    -
  4. -
  5. -

    The POPRET instructions

    -
    -
      -
    • -

      pop (load) the registers in the register list from the stack frame

      -
    • -
    • -

      cm.popretz also moves zero into a0 as the return value

      -
    • -
    • -

      adjust the stack pointer to destroy the stack frame

      -
    • -
    • -

      execute a ret instruction to return from the function

      -
    • -
    -
    -
  6. -
-
-
-
-
-

29.13.2. Example usage

-
-

This example gives an illustration of the use of PUSH and POPRET.

-
-
-

The function processMarkers in the EMBench benchmark picojpeg in the following file on github: libpicojpeg.c

-
-
-

The prologue and epilogue compile with GCC10 to:

-
-
-
-
   0001098a <processMarkers>:
-   1098a:       711d                    addi    sp,sp,-96 ;#cm.push(1)
-   1098c:       c8ca                    sw      s2,80(sp) ;#cm.push(2)
-   1098e:       c6ce                    sw      s3,76(sp) ;#cm.push(3)
-   10990:       c4d2                    sw      s4,72(sp) ;#cm.push(4)
-   10992:       ce86                    sw      ra,92(sp) ;#cm.push(5)
-   10994:       cca2                    sw      s0,88(sp) ;#cm.push(6)
-   10996:       caa6                    sw      s1,84(sp) ;#cm.push(7)
-   10998:       c2d6                    sw      s5,68(sp) ;#cm.push(8)
-   1099a:       c0da                    sw      s6,64(sp) ;#cm.push(9)
-   1099c:       de5e                    sw      s7,60(sp) ;#cm.push(10)
-   1099e:       dc62                    sw      s8,56(sp) ;#cm.push(11)
-   109a0:       da66                    sw      s9,52(sp) ;#cm.push(12)
-   109a2:       d86a                    sw      s10,48(sp);#cm.push(13)
-   109a4:       d66e                    sw      s11,44(sp);#cm.push(14)
-...
-   109f4:       4501                    li      a0,0      ;#cm.popretz(1)
-   109f6:       40f6                    lw      ra,92(sp) ;#cm.popretz(2)
-   109f8:       4466                    lw      s0,88(sp) ;#cm.popretz(3)
-   109fa:       44d6                    lw      s1,84(sp) ;#cm.popretz(4)
-   109fc:       4946                    lw      s2,80(sp) ;#cm.popretz(5)
-   109fe:       49b6                    lw      s3,76(sp) ;#cm.popretz(6)
-   10a00:       4a26                    lw      s4,72(sp) ;#cm.popretz(7)
-   10a02:       4a96                    lw      s5,68(sp) ;#cm.popretz(8)
-   10a04:       4b06                    lw      s6,64(sp) ;#cm.popretz(9)
-   10a06:       5bf2                    lw      s7,60(sp) ;#cm.popretz(10)
-   10a08:       5c62                    lw      s8,56(sp) ;#cm.popretz(11)
-   10a0a:       5cd2                    lw      s9,52(sp) ;#cm.popretz(12)
-   10a0c:       5d42                    lw      s10,48(sp);#cm.popretz(13)
-   10a0e:       5db2                    lw      s11,44(sp);#cm.popretz(14)
-   10a10:       6125                    addi    sp,sp,96  ;#cm.popretz(15)
-   10a12:       8082                    ret               ;#cm.popretz(16)
-
-
-
-
-

with the GCC option -msave-restore the output is the following:

-
-
-
-
0001080e <processMarkers>:
-   1080e:       73a012ef                jal     t0,11f48 <__riscv_save_12>
-   10812:       1101                    addi    sp,sp,-32
-...
-   10862:       4501                    li      a0,0
-   10864:       6105                    addi    sp,sp,32
-   10866:       71e0106f                j       11f84 <__riscv_restore_12>
-
-
-
-

with PUSH/POPRET this reduces to

-
-
-
-
0001080e <processMarkers>:
-   1080e:       b8fa                    cm.push    {ra,s0-s11},-96
-...
-   10866:       bcfa                    cm.popretz {ra,s0-s11}, 96
-
-
-
-

The prologue / epilogue reduce from 60-bytes in the original code, to 14-bytes with -msave-restore, -and to 4-bytes with PUSH and POPRET. -As well as reducing the code-size PUSH and POPRET eliminate the branches from -calling the millicode save/restore routines and so may also perform better.

-
-
- - - - - -
- - -
-

The calls to <__riscv_save_0>/<__riscv_restore_0> become 64-bit when the target functions are out of the ±1MB range, increasing the prologue/epilogue size to 22-bytes.

-
-
-
-
- - - - - -
- - -
-

POP is typically used in tail-calling sequences where ret is not used to return to ra after destroying the stack frame.

-
-
-
-
-
Stack pointer adjustment handling
-
-

The instructions all automatically adjust the stack pointer by enough to cover the memory required for the registers being saved or restored. -Additionally the spimm field in the encoding allows the stack pointer to be adjusted in additional increments of 16-bytes. There is only a small restricted -range available in the encoding; if the range is insufficient then a separate c.addi16sp can be used to increase the range.

-
-
-
-
Register list handling
-
-

There is no support for the {ra, s0-s10} register list without also adding s11. Therefore the {ra, s0-s11} register list must be used in this case.

-
-
-
-
-

29.13.3. PUSH/POP Fault handling

-
-

Correct execution requires that sp refers to idempotent memory (also see Non-idempotent memory handling), because the core must be able to -handle traps detected during the sequence. -The entire PUSH/POP sequence is re-executed after returning from the trap handler, and multiple traps are possible during the sequence.

-
-
-

If a trap occurs during the sequence then xEPC is updated with the PC of the instruction, xTVAL (if not read-only-zero) is updated with the bad address if it was an access fault, and xCAUSE is updated with the type of trap.

-
-
- - - - - -
- - -It is implementation defined whether interrupts can also be taken during the sequence execution. -
-
-
-
-

29.13.4. Software view of execution

-
-
Software view of the PUSH sequence
-
-

From a software perspective the PUSH sequence appears as:

-
-
-
    -
  • -

    A sequence of stores writing the bytes required by the pseudocode

    -
    -
      -
    • -

      The bytes may be written in any order.

      -
    • -
    • -

      The bytes may be grouped into larger accesses.

      -
    • -
    • -

      Any of the bytes may be written multiple times.

      -
    • -
    -
    -
  • -
  • -

    A stack pointer adjustment

    -
  • -
-
-
- - - - - -
- - -
-

If an implementation allows interrupts during the sequence, and the interrupt handler uses sp to allocate stack memory, then any stores which were executed before the interrupt may be overwritten by the handler. This is safe because the memory is idempotent and the stores will be re-executed when execution resumes.

-
-
-
-
-

The stack pointer adjustment must be committed only when it is certain that the entire PUSH instruction will commit.

-
-
-

Stores may also return imprecise faults from the bus. -It is platform defined whether the core implementation waits for the bus responses before continuing to the final stage of the sequence, -or handles error responses after completing the PUSH instruction.

-
-
-
-

For example:

-
-
-
-
cm.push  {ra, s0-s5}, -64
-
-
-
-

Appears to software as:

-
-
-
-
# any bytes from sp-1 to sp-28 may be written multiple times before
-# the instruction completes therefore these updates may be visible in
-# the interrupt/exception handler below the stack pointer
-sw  s5, -4(sp)
-sw  s4, -8(sp)
-sw  s3,-12(sp)
-sw  s2,-16(sp)
-sw  s1,-20(sp)
-sw  s0,-24(sp)
-sw  ra,-28(sp)
-
-# this must only execute once, and will only execute after all stores
-# completed without any precise faults, therefore this update is only
-# visible in the interrupt/exception handler if cm.push has completed
-addi sp, sp, -64
-
-
-
-
-
Software view of the POP/POPRET sequence
-
-

From a software perspective the POP/POPRET sequence appears as:

-
-
-
    -
  • -

    A sequence of loads reading the bytes required by the pseudocode.

    -
    -
      -
    • -

      The bytes may be loaded in any order.

      -
    • -
    • -

      The bytes may be grouped into larger accesses.

      -
    • -
    • -

      Any of the bytes may be loaded multiple times.

      -
    • -
    -
    -
  • -
  • -

    A stack pointer adjustment

    -
  • -
  • -

    An optional li a0, 0

    -
  • -
  • -

    An optional ret

    -
  • -
-
-
-

If a trap occurs during the sequence, then any loads which were executed before the trap may update architectural state. -The loads will be re-executed once the trap handler completes, so the values will be overwritten. -Therefore it is permitted for an implementation to update some of the destination registers before taking a fault.

-
-
-

The optional li a0, 0, stack pointer adjustment and optional ret must be committed only when it is certain that the entire POP/POPRET instruction will commit.

-
-
-

For POPRET once the stack pointer adjustment has been committed the ret must execute.

-
-
-
-

For example:

-
-
-
-
cm.popretz {ra, s0-s3}, 32;
-
-
-
-

Appears to software as:

-
-
-
-
# any or all of these load instructions may execute multiple times
-# therefore these updates may be visible in the interrupt/exception handler
-lw   s3, 28(sp)
-lw   s2, 24(sp)
-lw   s1, 20(sp)
-lw   s0, 16(sp)
-lw   ra, 12(sp)
-
-# these must only execute once, will only execute after all loads
-# complete successfully all instructions must execute atomically
-# therefore these updates are not visible in the interrupt/exception handler
-li a0, 0
-addi sp, sp, 32
-ret
-
-
-
-
-
-

29.13.5. Non-idempotent memory handling

-
-

An implementation may have a requirement to issue a PUSH/POP instruction to non-idempotent memory.

-
-
-

If the core implementation does not support PUSH/POP to non-idempotent memories, the core may use an idempotency PMA to detect it and take a -load (POP/POPRET) or store (PUSH) access fault exception in order to avoid unpredictable results.

-
-
-

Software should only use these instructions on non-idempotent memory regions when software can tolerate the required memory accesses -being issued repeatedly in the case that they cause exceptions.

-
-
-
-
-

29.13.6. Example RV32I PUSH/POP sequences

-
-

The examples included show the load/store series expansion and the stack adjustment. -Examples of cm.popret and cm.popretz are not included, as the difference in the expanded sequence from cm.pop is trivial in all cases.

-
-
-
cm.push {ra, s0-s2}, -64
-
-

Encoding: rlist=7, spimm=3

-
-
-

expands to:

-
-
-
-
sw  s2,  -4(sp);
-sw  s1,  -8(sp);
-sw  s0, -12(sp);
-sw  ra, -16(sp);
-addi sp, sp, -64;
-
-
-
-
-
cm.push {ra, s0-s11}, -112
-
-

Encoding: rlist=15, spimm=3

-
-
-

expands to:

-
-
-
-
sw  s11,  -4(sp);
-sw  s10,  -8(sp);
-sw  s9,  -12(sp);
-sw  s8,  -16(sp);
-sw  s7,  -20(sp);
-sw  s6,  -24(sp);
-sw  s5,  -28(sp);
-sw  s4,  -32(sp);
-sw  s3,  -36(sp);
-sw  s2,  -40(sp);
-sw  s1,  -44(sp);
-sw  s0,  -48(sp);
-sw  ra,  -52(sp);
-addi sp, sp, -112;
-
-
-
-
-
-
cm.pop {ra}, 16
-
-

Encoding: rlist=4, spimm=0

-
-
-

expands to:

-
-
-
-
lw   ra, 12(sp);
-addi sp, sp, 16;
-
-
-
-
-
cm.pop {ra, s0-s3}, 48
-
-

Encoding: rlist=8, spimm=1

-
-
-

expands to:

-
-
-
-
lw   s3, 44(sp);
-lw   s2, 40(sp);
-lw   s1, 36(sp);
-lw   s0, 32(sp);
-lw   ra, 28(sp);
-addi sp, sp, 48;
-
-
-
-
-
cm.pop {ra, s0-s4}, 64
-
-

Encoding: rlist=9, spimm=2

-
-
-

expands to:

-
-
-
-
lw   s4, 60(sp);
-lw   s3, 56(sp);
-lw   s2, 52(sp);
-lw   s1, 48(sp);
-lw   s0, 44(sp);
-lw   ra, 40(sp);
-addi sp, sp, 64;
-
-
-
-
-
-
-

29.13.7. cm.push

-
-

Synopsis:

-
-
-

Create stack frame: store ra and 0 to 12 saved registers to the stack frame, optionally allocate additional stack space.

-
-
-

Mnemonic:

-
-
-

cm.push {reg_list}, -stack_adj

-
-
-

Encoding (RV32, RV64):

-
-
-
-Diagram -
-
-
- - - - - -
- - -
-

rlist values 0 to 3 are reserved for a future EABI variant called cm.push.e

-
-
-
-
-

Assembly Syntax:

-
-
-
-
cm.push {reg_list},  -stack_adj
-cm.push {xreg_list}, -stack_adj
-
-
-
-

The variables used in the assembly syntax are defined below.

-
-
-
-
RV32E:
-
-switch (rlist){
-  case  4: {reg_list="ra";         xreg_list="x1";}
-  case  5: {reg_list="ra, s0";     xreg_list="x1, x8";}
-  case  6: {reg_list="ra, s0-s1";  xreg_list="x1, x8-x9";}
-  default: reserved();
-}
-stack_adj      = stack_adj_base + spimm[5:4] * 16;
-
-
-
-
-
RV32I, RV64:
-
-switch (rlist){
-  case  4: {reg_list="ra";         xreg_list="x1";}
-  case  5: {reg_list="ra, s0";     xreg_list="x1, x8";}
-  case  6: {reg_list="ra, s0-s1";  xreg_list="x1, x8-x9";}
-  case  7: {reg_list="ra, s0-s2";  xreg_list="x1, x8-x9, x18";}
-  case  8: {reg_list="ra, s0-s3";  xreg_list="x1, x8-x9, x18-x19";}
-  case  9: {reg_list="ra, s0-s4";  xreg_list="x1, x8-x9, x18-x20";}
-  case 10: {reg_list="ra, s0-s5";  xreg_list="x1, x8-x9, x18-x21";}
-  case 11: {reg_list="ra, s0-s6";  xreg_list="x1, x8-x9, x18-x22";}
-  case 12: {reg_list="ra, s0-s7";  xreg_list="x1, x8-x9, x18-x23";}
-  case 13: {reg_list="ra, s0-s8";  xreg_list="x1, x8-x9, x18-x24";}
-  case 14: {reg_list="ra, s0-s9";  xreg_list="x1, x8-x9, x18-x25";}
-  //note - to include s10, s11 must also be included
-  case 15: {reg_list="ra, s0-s11"; xreg_list="x1, x8-x9, x18-x27";}
-  default: reserved();
-}
-stack_adj      = stack_adj_base + spimm[5:4] * 16;
-
-
-
-
-
RV32E:
-
-stack_adj_base = 16;
-Valid values:
-stack_adj      = [16|32|48|64];
-
-
-
-
-
RV32I:
-
-switch (rlist) {
-  case  4.. 7: stack_adj_base = 16;
-  case  8..11: stack_adj_base = 32;
-  case 12..14: stack_adj_base = 48;
-  case     15: stack_adj_base = 64;
-}
-
-Valid values:
-switch (rlist) {
-  case  4.. 7: stack_adj = [16|32|48| 64];
-  case  8..11: stack_adj = [32|48|64| 80];
-  case 12..14: stack_adj = [48|64|80| 96];
-  case     15: stack_adj = [64|80|96|112];
-}
-
-
-
-
-
RV64:
-
-switch (rlist) {
-  case  4.. 5: stack_adj_base =  16;
-  case  6.. 7: stack_adj_base =  32;
-  case  8.. 9: stack_adj_base =  48;
-  case 10..11: stack_adj_base =  64;
-  case 12..13: stack_adj_base =  80;
-  case     14: stack_adj_base =  96;
-  case     15: stack_adj_base = 112;
-}
-
-Valid values:
-switch (rlist) {
-  case  4.. 5: stack_adj = [ 16| 32| 48| 64];
-  case  6.. 7: stack_adj = [ 32| 48| 64| 80];
-  case  8.. 9: stack_adj = [ 48| 64| 80| 96];
-  case 10..11: stack_adj = [ 64| 80| 96|112];
-  case 12..13: stack_adj = [ 80| 96|112|128];
-  case     14: stack_adj = [ 96|112|128|144];
-  case     15: stack_adj = [112|128|144|160];
-}
-
-
-
-
-

Description:

-
-
-

This instruction pushes (stores) the registers in reg_list to the memory below the stack pointer, -and then creates the stack frame by decrementing the stack pointer by stack_adj, -including any additional stack space requested by the value of spimm.

-
-
- - - - - -
- - -
-

All ABI register mappings are for the UABI. An EABI version is planned once the EABI is frozen.

-
-
-
-
-

For further information see PUSH/POP Register Instructions.

-
-
-

Stack Adjustment Calculation:

-
-
-

stack_adj_base is the minimum number of bytes, in multiples of 16-byte address increments, required to cover the registers in the list.

-
-
-

spimm is the number of additional 16-byte address increments allocated for the stack frame.

-
-
-

The total stack adjustment represents the total size of the stack frame, which is stack_adj_base added to spimm scaled by 16, -as defined above.

-
-
-

Prerequisites:

-
-
-

None

-
-
-

32-bit equivalent:

-
-
-

No direct equivalent encoding exists

-
-
-

Operation:

-
-
-

The first section of pseudocode may be executed multiple times before the instruction successfully completes.

-
-
-
-
//This is not SAIL, it's pseudocode. The SAIL hasn't been written yet.
-
-if (XLEN==32) bytes=4; else bytes=8;
-
-addr=sp-bytes;
-for(i in 27,26,25,24,23,22,21,20,19,18,9,8,1)  {
-  //if register i is in xreg_list
-  if (xreg_list[i]) {
-    switch(bytes) {
-      4:  asm("sw x[i], 0(addr)");
-      8:  asm("sd x[i], 0(addr)");
-    }
-    addr-=bytes;
-  }
-}
-
-
-
-

The final section of pseudocode executes atomically, and only executes if the section above completes without any exceptions or interrupts.

-
-
-
-
//This is not SAIL, it's pseudocode. The SAIL hasn't been written yet.
-
-sp-=stack_adj;
-
-
-
-
-
-

29.13.8. cm.pop

-
-

Synopsis:

-
-
-

Destroy stack frame: load ra and 0 to 12 saved registers from the stack frame, deallocate the stack frame.

-
-
-

Mnemonic:

-
-
-

cm.pop {reg_list}, stack_adj

-
-
-

Encoding (RV32, RV64):

-
-
-
-Diagram -
-
-
- - - - - -
- - -
-

rlist values 0 to 3 are reserved for a future EABI variant called cm.pop.e

-
-
-
-
-

Assembly Syntax:

-
-
-
-
cm.pop {reg_list},  stack_adj
-cm.pop {xreg_list}, stack_adj
-
-
-
-

The variables used in the assembly syntax are defined below.

-
-
-
-
RV32E:
-
-switch (rlist){
-  case  4: {reg_list="ra";         xreg_list="x1";}
-  case  5: {reg_list="ra, s0";     xreg_list="x1, x8";}
-  case  6: {reg_list="ra, s0-s1";  xreg_list="x1, x8-x9";}
-  default: reserved();
-}
-stack_adj      = stack_adj_base + spimm[5:4] * 16;
-
-
-
-
-
RV32I, RV64:
-
-switch (rlist){
-  case  4: {reg_list="ra";         xreg_list="x1";}
-  case  5: {reg_list="ra, s0";     xreg_list="x1, x8";}
-  case  6: {reg_list="ra, s0-s1";  xreg_list="x1, x8-x9";}
-  case  7: {reg_list="ra, s0-s2";  xreg_list="x1, x8-x9, x18";}
-  case  8: {reg_list="ra, s0-s3";  xreg_list="x1, x8-x9, x18-x19";}
-  case  9: {reg_list="ra, s0-s4";  xreg_list="x1, x8-x9, x18-x20";}
-  case 10: {reg_list="ra, s0-s5";  xreg_list="x1, x8-x9, x18-x21";}
-  case 11: {reg_list="ra, s0-s6";  xreg_list="x1, x8-x9, x18-x22";}
-  case 12: {reg_list="ra, s0-s7";  xreg_list="x1, x8-x9, x18-x23";}
-  case 13: {reg_list="ra, s0-s8";  xreg_list="x1, x8-x9, x18-x24";}
-  case 14: {reg_list="ra, s0-s9";  xreg_list="x1, x8-x9, x18-x25";}
-  //note - to include s10, s11 must also be included
-  case 15: {reg_list="ra, s0-s11"; xreg_list="x1, x8-x9, x18-x27";}
-  default: reserved();
-}
-stack_adj      = stack_adj_base + spimm[5:4] * 16;
-
-
-
-
-
RV32E:
-
-stack_adj_base = 16;
-Valid values:
-stack_adj      = [16|32|48|64];
-
-
-
-
-
RV32I:
-
-switch (rlist) {
-  case  4.. 7: stack_adj_base = 16;
-  case  8..11: stack_adj_base = 32;
-  case 12..14: stack_adj_base = 48;
-  case     15: stack_adj_base = 64;
-}
-
-Valid values:
-switch (rlist) {
-  case  4.. 7: stack_adj = [16|32|48| 64];
-  case  8..11: stack_adj = [32|48|64| 80];
-  case 12..14: stack_adj = [48|64|80| 96];
-  case     15: stack_adj = [64|80|96|112];
-}
-
-
-
-
-
RV64:
-
-switch (rlist) {
-  case  4.. 5: stack_adj_base =  16;
-  case  6.. 7: stack_adj_base =  32;
-  case  8.. 9: stack_adj_base =  48;
-  case 10..11: stack_adj_base =  64;
-  case 12..13: stack_adj_base =  80;
-  case     14: stack_adj_base =  96;
-  case     15: stack_adj_base = 112;
-}
-
-Valid values:
-switch (rlist) {
-  case  4.. 5: stack_adj = [ 16| 32| 48| 64];
-  case  6.. 7: stack_adj = [ 32| 48| 64| 80];
-  case  8.. 9: stack_adj = [ 48| 64| 80| 96];
-  case 10..11: stack_adj = [ 64| 80| 96|112];
-  case 12..13: stack_adj = [ 80| 96|112|128];
-  case     14: stack_adj = [ 96|112|128|144];
-  case     15: stack_adj = [112|128|144|160];
-}
-
-
-
-
-

Description:

-
-
-

This instruction pops (loads) the registers in reg_list from stack memory, -and then adjusts the stack pointer by stack_adj.

-
-
- - - - - -
- - -
-

All ABI register mappings are for the UABI. An EABI version is planned once the EABI is frozen.

-
-
-
-
-

For further information see PUSH/POP Register Instructions.

-
-
-

Stack Adjustment Calculation:

-
-
-

stack_adj_base is the minimum number of bytes, in multiples of 16-byte address increments, required to cover the registers in the list.

-
-
-

spimm is the number of additional 16-byte address increments allocated for the stack frame.

-
-
-

The total stack adjustment represents the total size of the stack frame, which is stack_adj_base added to spimm scaled by 16, -as defined above.

-
-
-

Prerequisites:

-
-
-

None

-
-
-

32-bit equivalent:

-
-
-

No direct equivalent encoding exists

-
-
-

Operation:

-
-
-

The first section of pseudocode may be executed multiple times before the instruction successfully completes.

-
-
-
-
//This is not SAIL, it's pseudocode. The SAIL hasn't been written yet.
-
-if (XLEN==32) bytes=4; else bytes=8;
-
-addr=sp+stack_adj-bytes;
-for(i in 27,26,25,24,23,22,21,20,19,18,9,8,1)  {
-  //if register i is in xreg_list
-  if (xreg_list[i]) {
-    switch(bytes) {
-      4:  asm("lw x[i], 0(addr)");
-      8:  asm("ld x[i], 0(addr)");
-    }
-    addr-=bytes;
-  }
-}
-
-
-
-

The final section of pseudocode executes atomically, and only executes if the section above completes without any exceptions or interrupts.

-
-
-
-
//This is not SAIL, it's pseudocode. The SAIL hasn't been written yet.
-
-sp+=stack_adj;
-
-
-
-
-
-

29.13.9. cm.popretz

-
-

Synopsis:

-
-
-

Destroy stack frame: load ra and 0 to 12 saved registers from the stack frame, deallocate the stack frame, move zero into a0, return to ra.

-
-
-

Mnemonic:

-
-
-

cm.popretz {reg_list}, stack_adj

-
-
-

Encoding (RV32, RV64):

-
-
-
-Diagram -
-
-
- - - - - -
- - -
-

rlist values 0 to 3 are reserved for a future EABI variant called cm.popretz.e

-
-
-
-
-

Assembly Syntax:

-
-
-
-
cm.popretz {reg_list},  stack_adj
-cm.popretz {xreg_list}, stack_adj
-
-
-
-
-
RV32E:
-
-switch (rlist){
-  case  4: {reg_list="ra";         xreg_list="x1";}
-  case  5: {reg_list="ra, s0";     xreg_list="x1, x8";}
-  case  6: {reg_list="ra, s0-s1";  xreg_list="x1, x8-x9";}
-  default: reserved();
-}
-stack_adj      = stack_adj_base + spimm[5:4] * 16;
-
-
-
-
-
RV32I, RV64:
-
-switch (rlist){
-  case  4: {reg_list="ra";         xreg_list="x1";}
-  case  5: {reg_list="ra, s0";     xreg_list="x1, x8";}
-  case  6: {reg_list="ra, s0-s1";  xreg_list="x1, x8-x9";}
-  case  7: {reg_list="ra, s0-s2";  xreg_list="x1, x8-x9, x18";}
-  case  8: {reg_list="ra, s0-s3";  xreg_list="x1, x8-x9, x18-x19";}
-  case  9: {reg_list="ra, s0-s4";  xreg_list="x1, x8-x9, x18-x20";}
-  case 10: {reg_list="ra, s0-s5";  xreg_list="x1, x8-x9, x18-x21";}
-  case 11: {reg_list="ra, s0-s6";  xreg_list="x1, x8-x9, x18-x22";}
-  case 12: {reg_list="ra, s0-s7";  xreg_list="x1, x8-x9, x18-x23";}
-  case 13: {reg_list="ra, s0-s8";  xreg_list="x1, x8-x9, x18-x24";}
-  case 14: {reg_list="ra, s0-s9";  xreg_list="x1, x8-x9, x18-x25";}
-  //note - to include s10, s11 must also be included
-  case 15: {reg_list="ra, s0-s11"; xreg_list="x1, x8-x9, x18-x27";}
-  default: reserved();
-}
-stack_adj      = stack_adj_base + spimm[5:4] * 16;
-
-
-
-
-
RV32E:
-
-stack_adj_base = 16;
-Valid values:
-stack_adj      = [16|32|48|64];
-
-
-
-
-
RV32I:
-
-switch (rlist) {
-  case  4.. 7: stack_adj_base = 16;
-  case  8..11: stack_adj_base = 32;
-  case 12..14: stack_adj_base = 48;
-  case     15: stack_adj_base = 64;
-}
-
-Valid values:
-switch (rlist) {
-  case  4.. 7: stack_adj = [16|32|48| 64];
-  case  8..11: stack_adj = [32|48|64| 80];
-  case 12..14: stack_adj = [48|64|80| 96];
-  case     15: stack_adj = [64|80|96|112];
-}
-
-
-
-
-
RV64:
-
-switch (rlist) {
-  case  4.. 5: stack_adj_base =  16;
-  case  6.. 7: stack_adj_base =  32;
-  case  8.. 9: stack_adj_base =  48;
-  case 10..11: stack_adj_base =  64;
-  case 12..13: stack_adj_base =  80;
-  case     14: stack_adj_base =  96;
-  case     15: stack_adj_base = 112;
-}
-
-Valid values:
-switch (rlist) {
-  case  4.. 5: stack_adj = [ 16| 32| 48| 64];
-  case  6.. 7: stack_adj = [ 32| 48| 64| 80];
-  case  8.. 9: stack_adj = [ 48| 64| 80| 96];
-  case 10..11: stack_adj = [ 64| 80| 96|112];
-  case 12..13: stack_adj = [ 80| 96|112|128];
-  case     14: stack_adj = [ 96|112|128|144];
-  case     15: stack_adj = [112|128|144|160];
-}
-
-
-
-
-

Description:

-
-
-

This instruction pops (loads) the registers in reg_list from stack memory, adjusts the stack pointer by stack_adj, moves zero into a0 and then returns to ra.

-
-
- - - - - -
- - -
-

All ABI register mappings are for the UABI. An EABI version is planned once the EABI is frozen.

-
-
-
-
-

For further information see PUSH/POP Register Instructions.

-
-
-

Stack Adjustment Calculation:

-
-
-

stack_adj_base is the minimum number of bytes, in multiples of 16-byte address increments, required to cover the registers in the list.

-
-
-

spimm is the number of additional 16-byte address increments allocated for the stack frame.

-
-
-

The total stack adjustment represents the total size of the stack frame, which is stack_adj_base added to spimm scaled by 16, as defined above.

-
-
-

Prerequisites:

-
-
-

None

-
-
-

32-bit equivalent:

-
-
-

No direct equivalent encoding exists

-
-
-

Operation:

-
-
-

The first section of pseudocode may be executed multiple times before the instruction successfully completes.

-
-
-
-
//This is not SAIL, it's pseudocode. The SAIL hasn't been written yet.
-
-if (XLEN==32) bytes=4; else bytes=8;
-
-addr=sp+stack_adj-bytes;
-for(i in 27,26,25,24,23,22,21,20,19,18,9,8,1)  {
-  //if register i is in xreg_list
-  if (xreg_list[i]) {
-    switch(bytes) {
-      4:  asm("lw x[i], 0(addr)");
-      8:  asm("ld x[i], 0(addr)");
-    }
-    addr-=bytes;
-  }
-}
-
-
-
-

The final section of pseudocode executes atomically, and only executes if the section above completes without any exceptions or interrupts.

-
-
- - - - - -
- - -
-

The li a0, 0 could be executed more than once, but is included in the atomic section for convenience.

-
-
-
-
-
-
//This is not SAIL, it's pseudocode. The SAIL hasn't been written yet.
-
-asm("li a0, 0");
-sp+=stack_adj;
-asm("ret");
-
-
-
-
-
-

29.13.10. cm.popret

-
-

Synopsis:

-
-
-

Destroy stack frame: load ra and 0 to 12 saved registers from the stack frame, deallocate the stack frame, return to ra.

-
-
-

Mnemonic:

-
-
-

cm.popret {reg_list}, stack_adj

-
-
-

Encoding (RV32, RV64):

-
-
-
-Diagram -
-
-
- - - - - -
- - -
-

rlist values 0 to 3 are reserved for a future EABI variant called cm.popret.e

-
-
-
-
-

Assembly Syntax:

-
-
-
-
cm.popret {reg_list},  stack_adj
-cm.popret {xreg_list}, stack_adj
-
-
-
-

The variables used in the assembly syntax are defined below.

-
-
-
-
RV32E:
-
-switch (rlist){
-  case  4: {reg_list="ra";         xreg_list="x1";}
-  case  5: {reg_list="ra, s0";     xreg_list="x1, x8";}
-  case  6: {reg_list="ra, s0-s1";  xreg_list="x1, x8-x9";}
-  default: reserved();
-}
-stack_adj      = stack_adj_base + spimm[5:4] * 16;
-
-
-
-
-
RV32I, RV64:
-
-switch (rlist){
-  case  4: {reg_list="ra";         xreg_list="x1";}
-  case  5: {reg_list="ra, s0";     xreg_list="x1, x8";}
-  case  6: {reg_list="ra, s0-s1";  xreg_list="x1, x8-x9";}
-  case  7: {reg_list="ra, s0-s2";  xreg_list="x1, x8-x9, x18";}
-  case  8: {reg_list="ra, s0-s3";  xreg_list="x1, x8-x9, x18-x19";}
-  case  9: {reg_list="ra, s0-s4";  xreg_list="x1, x8-x9, x18-x20";}
-  case 10: {reg_list="ra, s0-s5";  xreg_list="x1, x8-x9, x18-x21";}
-  case 11: {reg_list="ra, s0-s6";  xreg_list="x1, x8-x9, x18-x22";}
-  case 12: {reg_list="ra, s0-s7";  xreg_list="x1, x8-x9, x18-x23";}
-  case 13: {reg_list="ra, s0-s8";  xreg_list="x1, x8-x9, x18-x24";}
-  case 14: {reg_list="ra, s0-s9";  xreg_list="x1, x8-x9, x18-x25";}
-  //note - to include s10, s11 must also be included
-  case 15: {reg_list="ra, s0-s11"; xreg_list="x1, x8-x9, x18-x27";}
-  default: reserved();
-}
-stack_adj      = stack_adj_base + spimm[5:4] * 16;
-
-
-
-
-
RV32E:
-
-stack_adj_base = 16;
-Valid values:
-stack_adj      = [16|32|48|64];
-
-
-
-
-
RV32I:
-
-switch (rlist) {
-  case  4.. 7: stack_adj_base = 16;
-  case  8..11: stack_adj_base = 32;
-  case 12..14: stack_adj_base = 48;
-  case     15: stack_adj_base = 64;
-}
-
-Valid values:
-switch (rlist) {
-  case  4.. 7: stack_adj = [16|32|48| 64];
-  case  8..11: stack_adj = [32|48|64| 80];
-  case 12..14: stack_adj = [48|64|80| 96];
-  case     15: stack_adj = [64|80|96|112];
-}
-
-
-
-
-
RV64:
-
-switch (rlist) {
-  case  4.. 5: stack_adj_base =  16;
-  case  6.. 7: stack_adj_base =  32;
-  case  8.. 9: stack_adj_base =  48;
-  case 10..11: stack_adj_base =  64;
-  case 12..13: stack_adj_base =  80;
-  case     14: stack_adj_base =  96;
-  case     15: stack_adj_base = 112;
-}
-
-Valid values:
-switch (rlist) {
-  case  4.. 5: stack_adj = [ 16| 32| 48| 64];
-  case  6.. 7: stack_adj = [ 32| 48| 64| 80];
-  case  8.. 9: stack_adj = [ 48| 64| 80| 96];
-  case 10..11: stack_adj = [ 64| 80| 96|112];
-  case 12..13: stack_adj = [ 80| 96|112|128];
-  case     14: stack_adj = [ 96|112|128|144];
-  case     15: stack_adj = [112|128|144|160];
-}
-
-
-
-
-

Description:

-
-
-

This instruction pops (loads) the registers in reg_list from stack memory, adjusts the stack pointer by stack_adj and then returns to ra.

-
-
- - - - - -
- - -
-

All ABI register mappings are for the UABI. An EABI version is planned once the EABI is frozen.

-
-
-
-
-

For further information see PUSH/POP Register Instructions.

-
-
-

Stack Adjustment Calculation:

-
-
-

stack_adj_base is the minimum number of bytes, in multiples of 16-byte address increments, required to cover the registers in the list.

-
-
-

spimm is the number of additional 16-byte address increments allocated for the stack frame.

-
-
-

The total stack adjustment represents the total size of the stack frame, which is stack_adj_base added to spimm scaled by 16, as defined above.

-
-
-

Prerequisites:

-
-
-

None

-
-
-

32-bit equivalent:

-
-
-

No direct equivalent encoding exists

-
-
-

Operation:

-
-
-

The first section of pseudocode may be executed multiple times before the instruction successfully completes.

-
-
-
-
//This is not SAIL, it's pseudocode. The SAIL hasn't been written yet.
-
-if (XLEN==32) bytes=4; else bytes=8;
-
-addr=sp+stack_adj-bytes;
-for(i in 27,26,25,24,23,22,21,20,19,18,9,8,1)  {
-  //if register i is in xreg_list
-  if (xreg_list[i]) {
-    switch(bytes) {
-      4:  asm("lw x[i], 0(addr)");
-      8:  asm("ld x[i], 0(addr)");
-    }
-    addr-=bytes;
-  }
-}
-
-
-
-

The final section of pseudocode executes atomically, and only executes if the section above completes without any exceptions or interrupts.

-
-
-
-
//This is not SAIL, it's pseudocode. The SAIL hasn't been written yet.
-
-sp+=stack_adj;
-asm("ret");
-
-
-
-
-
-

29.13.11. cm.mvsa01

-
-

Synopsis:

-
-
-

Move a0-a1 into two registers of s0-s7

-
-
-

Mnemonic:

-
-
-

cm.mvsa01 r1s', r2s'

-
-
-

Encoding (RV32, RV64):

-
-
-
-Diagram -
-
-
- - - - - -
- - -
-

For the encoding to be legal r1s' != r2s'.

-
-
-
-
-

Assembly Syntax:

-
-
-
-
cm.mvsa01 r1s', r2s'
-
-
-
-

Description: -This instruction moves a0 into r1s' and a1 into r2s'. r1s' and r2s' must be different. -The execution is atomic, so it is not possible to observe state where only one of r1s' or r2s' has been updated.

-
-
-

The encoding uses sreg number specifiers instead of xreg number specifiers to save encoding space. -The mapping between them is specified in the pseudocode below.

-
-
- - - - - -
- - -
-

The s register mapping is taken from the UABI, and may not match the currently unratified EABI. cm.mvsa01.e may be included in the future.

-
-
-
-
-

Prerequisites:

-
-
-

None

-
-
-

32-bit equivalent:

-
-
-

No direct equivalent encoding exists.

-
-
-

Operation:

-
-
-
-
//This is not SAIL, it's pseudocode. The SAIL hasn't been written yet.
-if (RV32E && (r1sc>1 || r2sc>1)) {
-  reserved();
-}
-xreg1 = {r1sc[2:1]>0,r1sc[2:1]==0,r1sc[2:0]};
-xreg2 = {r2sc[2:1]>0,r2sc[2:1]==0,r2sc[2:0]};
-X[xreg1] = X[10];
-X[xreg2] = X[11];
-
-
-
-
-
-

29.13.12. cm.mva01s

-
-

Synopsis:

-
-
-

Move two s0-s7 registers into a0-a1

-
-
-

Mnemonic:

-
-
-

cm.mva01s r1s', r2s'

-
-
-

Encoding (RV32, RV64):

-
-
-
-Diagram -
-
-
-

Assembly Syntax:

-
-
-
-
cm.mva01s r1s', r2s'
-
-
-
-

Description: -This instruction moves r1s' into a0 and r2s' into a1. -The execution is atomic, so it is not possible to observe state where only one of a0 or a1 has been updated.

-
-
-

The encoding uses sreg number specifiers instead of xreg number specifiers to save encoding space. -The mapping between them is specified in the pseudocode below.

-
-
- - - - - -
- - -
-

The s register mapping is taken from the UABI, and may not match the currently unratified EABI. cm.mva01s.e may be included in the future.

-
-
-
-
-

Prerequisites:

-
-
-

None

-
-
-

32-bit equivalent:

-
-
-

No direct equivalent encoding exists.

-
-
-

Operation:

-
-
-
-
//This is not SAIL, it's pseudocode. The SAIL hasn't been written yet.
-if (RV32E && (r1sc>1 || r2sc>1)) {
-  reserved();
-}
-xreg1 = {r1sc[2:1]>0,r1sc[2:1]==0,r1sc[2:0]};
-xreg2 = {r2sc[2:1]>0,r2sc[2:1]==0,r2sc[2:0]};
-X[10] = X[xreg1];
-X[11] = X[xreg2];
-
-
-
-
-
-
-

29.14. Table Jump Overview

-
-

cm.jt (Jump via table) and cm.jalt (Jump and link via table) are referred to as table jump.

-
-
-

Table jump uses a 256-entry XLEN wide table in instruction memory to contain function addresses. -The table must be at least 64-byte aligned.

-
-
-

Table entries follow the current data endianness. This is different from normal instruction fetch which is always little-endian.

-
-
-

cm.jt and cm.jalt encodings index the table, giving access to functions within the full XLEN wide address space.

-
-
-

This is used as a form of dictionary compression to reduce the code size of jal / auipc+jalr / jr / auipc+jr instructions.

-
-
-

Table jump allows the linker to replace the following instruction sequences with a cm.jt or cm.jalt encoding, and an entry in the table:

-
-
-
    -
  • -

    32-bit j calls

    -
  • -
  • -

    32-bit jal ra calls

    -
  • -
  • -

    64-bit auipc+jr calls to fixed locations

    -
  • -
  • -

    64-bit auipc+jalr ra calls to fixed locations

    -
    -
      -
    • -

      The auipc+jr/jalr sequence is used because the offset from the PC is out of the ±1MB range.

      -
    • -
    -
    -
  • -
-
-
-

If a return address stack is implemented, then as cm.jalt is equivalent to jal ra, it pushes to the stack.

-
-
-

29.14.1. jvt

-
-

The base of the table is in the jvt CSR (see jvt CSR, table jump base vector and control register), each table entry is XLEN bits.

-
-
-

If the same function is called with and without linking then it must have two entries in the table. -This is typically caused by the same function being called with and without tail calling.

-
-
-
-

29.14.2. Table Jump Fault handling

-
-

For a table jump instruction, the table entry that the instruction selects is considered an extension of the instruction itself. -Hence, the execution of a table jump instruction involves two instruction fetches, the first to read the instruction (cm.jt/cm.jalt) -and the second to read from the jump vector table (JVT). Both instruction fetches are implicit reads, and both require -execute permission; read permission is irrelevant. It is recommended that the second fetch be ignored for hardware triggers and breakpoints.

-
-
-

Memory writes to the jump vector table require an instruction barrier (fence.i) to guarantee that they are visible to the instruction fetch.

-
-
-

Multiple contexts may have different jump vector tables. JVT may be switched between them without an instruction barrier -if the tables have not been updated in memory since the last fence.i.

-
-
-

If an exception occurs on either instruction fetch, xEPC is set to the PC of the table jump instruction, xCAUSE is set as expected for the type of fault and xTVAL (if not set to zero) contains the fetch address which caused the fault.

-
-
-
-
-

29.14.3. jvt CSR

-
-

Synopsis:

-
-
-

Table jump base vector and control register

-
-
-

Address:

-
-
-

0x0017

-
-
-

Permissions:

-
-
-

URW

-
-
-

Format (RV32):

-
-
-
-Diagram -
-
-
-

Format (RV64):

-
-
-
-Diagram -
-
-
-

Description:

-
-
-

The jvt register is an XLEN-bit WARL read/write register that holds the jump table configuration, consisting of the jump table base address (BASE) and the jump table mode (MODE).

-
-
-

If Section 29.10 is implemented then jvt must also be implemented, but can contain a read-only value. If jvt is writable, the set of values the register may hold can vary by implementation. The value in the BASE field must always be aligned on a 64-byte boundary.

-
-
-

jvt.base is a virtual address, whenever virtual memory is enabled.

-
-
-

The memory pointed to by jvt.base is treated as instruction memory for the purpose of executing table jump instructions, implying execute access permission.

-
- - ---- - - - - - - - - - - - - - - - - -
Table 26. jvt.mode definition
jvt.modeComment

000000

Jump table mode

others

reserved for future standard use

-
-

jvt.mode is a WARL field, so can only be programmed to modes which are implemented. Therefore the discovery mechanism is to -attempt to program different modes and read back the values to see which are available. Jump table mode must be implemented.

-
-
- - - - - -
- - -
-

In the future, the RISC-V Unified Discovery method will report the available modes.

-
-
-
-
-

Architectural State:

-
-
-

jvt CSR adds architectural state to the system software context (such as an OS process), therefore must be saved/restored on context switches.

-
-
-

State Enable:

-
-
-

If the Smstateen extension is implemented, then bit 2 in mstateen0, sstateen0, and hstateen0 is implemented. If bit 2 of a controlling stateen0 CSR is zero, then access to the jvt CSR and execution of a cm.jalt or cm.jt instruction by a lower privilege level results in an Illegal Instruction trap (or, if appropriate, a Virtual Instruction trap).

-
-
-
-
-

29.14.4. cm.jt

-
-

Synopsis:

-
-
-

jump via table

-
-
-

Mnemonic:

-
-
-

cm.jt index

-
-
-

Encoding (RV32, RV64):

-
-
-
-Diagram -
-
-
- - - - - -
- - -
-

For this encoding to decode as cm.jt, index<32, otherwise it decodes as cm.jalt, see Jump and link via table.

-
-
-
-
- - - - - -
- - -
-

If jvt.mode = 0 (Jump Table Mode) then cm.jt behaves as specified here. If jvt.mode is a reserved value, then cm.jt is also reserved. In the future other defined values of jvt.mode may change the behaviour of cm.jt.

-
-
-
-
-

Assembly Syntax:

-
-
-
-
cm.jt index
-
-
-
-

Description:

-
-
-

cm.jt reads an entry from the jump vector table in memory and jumps to the address that was read.

-
-
-

For further information see Table Jump Overview.

-
-
-

Prerequisites:

-
-
-

None

-
-
-

32-bit equivalent:

-
-
-

No direct equivalent encoding exists.

-
-
-
-

Operation:

-
-
-
-
//This is not SAIL, it's pseudocode. The SAIL hasn't been written yet.
-
-# target_address is temporary internal state, it doesn't represent a real register
-# InstMemory is byte indexed
-
-switch(XLEN) {
-  32:  table_address[XLEN-1:0] = jvt.base + (index<<2);
-  64:  table_address[XLEN-1:0] = jvt.base + (index<<3);
-}
-
-//fetch from the jump table
-target_address[XLEN-1:0] = InstMemory[table_address][XLEN-1:0];
-
-j target_address[XLEN-1:0]&~0x1;
-
-
-
-
-
-

29.14.5. cm.jalt

-
-

Synopsis:

-
-
-

jump via table with optional link

-
-
-

Mnemonic:

-
-
-

cm.jalt index

-
-
-

Encoding (RV32, RV64):

-
-
-
-Diagram -
-
-
- - - - - -
- - -
-

For this encoding to decode as cm.jalt, index>=32, otherwise it decodes as cm.jt, see Jump via table.

-
-
-
-
- - - - - -
- - -
-

If jvt.mode = 0 (Jump Table Mode) then cm.jalt behaves as specified here. If jvt.mode is a reserved value, then cm.jalt is also reserved. In the future other defined values of jvt.mode may change the behaviour of cm.jalt.

-
-
-
-
-

Assembly Syntax:

-
-
-
-
cm.jalt index
-
-
-
-

Description:

-
-
-

cm.jalt reads an entry from the jump vector table in memory and jumps to the address that was read, linking to ra.

-
-
-

For further information see Table Jump Overview.

-
-
-

Prerequisites:

-
-
-

None

-
-
-

32-bit equivalent:

-
-
-

No direct equivalent encoding exists.

-
-
-
-

Operation:

-
-
-
-
//This is not SAIL, it's pseudocode. The SAIL hasn't been written yet.
-
-# target_address is temporary internal state, it doesn't represent a real register
-# InstMemory is byte indexed
-
-switch(XLEN) {
-  32:  table_address[XLEN-1:0] = jvt.base + (index<<2);
-  64:  table_address[XLEN-1:0] = jvt.base + (index<<3);
-}
-
-//fetch from the jump table
-target_address[XLEN-1:0] = InstMemory[table_address][XLEN-1:0];
-
-jal ra, target_address[XLEN-1:0]&~0x1;
-
-
-
-
-
-
-
-

30. "B" Extension for Bit Manipulation, Version 1.0.0

-
-
-

The B standard extension comprises instructions provided by the Zba, Zbb, and -Zbs extensions.

-
-
-

30.1. Zb* Overview

-
-

The bit-manipulation (bitmanip) extension collection is comprised of several component extensions to the base RISC-V architecture that are intended to provide some combination of code size reduction, performance improvement, and energy reduction. -While the instructions are intended to have general use, some instructions are more useful in some domains than others. -Hence, several smaller bitmanip extensions are provided. Each of these smaller extensions is grouped by common function and use case, and each has its own Zb*-extension name.

-
-
-

Each bitmanip extension includes a group of several bitmanip instructions that have similar purposes and that can often share the same logic. Some instructions are available in only one extension while others are available in several. -The instructions have mnemonics and encodings that are independent of the extensions in which they appear. -Thus, when implementing extensions with overlapping instructions, there is no redundancy in logic or encoding.

-
-
-

The bitmanip extensions are defined for RV32 and RV64. -Most of the instructions are expected to be forward compatible with RV128. -While the shift-immediate instructions are defined to have at most a 6-bit immediate field, a 7th bit is available in the encoding space should this be needed for RV128.

-
-
-
-

30.2. Word Instructions

-
-

The bitmanip extension follows the convention in RV64 that w-suffixed instructions (without a dot before the w) ignore the upper 32 bits of their inputs, operate on the least significant 32 bits as signed values and produce a 32-bit signed result that is sign-extended to XLEN.

-
-
-

Bitmanip instructions with the suffix .uw have one operand that is an unsigned 32-bit value that is extracted from the least significant 32 bits of the specified register. Other than that, these perform full XLEN operations.

-
-
-

Bitmanip instructions with the suffixes .b, .h and .w look only at the least significant 8 bits, 16 bits and 32 bits of the input (respectively) and produce an XLEN-wide result that is sign-extended or zero-extended, based on the specific instruction.

-
-
-
-

30.3. Pseudocode for instruction semantics

-
-

The semantics of each instruction in Instructions (in alphabetical order) is expressed in a SAIL-like syntax.

-
-
-
-

30.4. Extensions

-
-

The first group of bitmanip extensions to be released for Public Review are:

-
- -
-

Below is a list of all of the instructions that are included in these extensions -along with their specific mapping:

-
- ---------- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
RV32RV64MnemonicInstructionZbaZbbZbcZbs

add.uw rd, rs1, rs2

Add unsigned word

andn rd, rs1, rs2

AND with inverted operand

clmul rd, rs1, rs2

Carry-less multiply (low-part)

clmulh rd, rs1, rs2

Carry-less multiply (high-part)

clmulr rd, rs1, rs2

Carry-less multiply (reversed)

clz rd, rs

Count leading zero bits

clzw rd, rs

Count leading zero bits in word

cpop rd, rs

Count set bits

cpopw rd, rs

Count set bits in word

ctz rd, rs

Count trailing zero bits

ctzw rd, rs

Count trailing zero bits in word

max rd, rs1, rs2

Maximum

maxu rd, rs1, rs2

Unsigned maximum

min rd, rs1, rs2

Minimum

minu rd, rs1, rs2

Unsigned minimum

orc.b rd, rs

Bitwise OR-Combine, byte granule

orn rd, rs1, rs2

OR with inverted operand

rev8 rd, rs

Byte-reverse register

rol rd, rs1, rs2

Rotate left (Register)

rolw rd, rs1, rs2

Rotate Left Word (Register)

ror rd, rs1, rs2

Rotate right (Register)

rori rd, rs1, shamt

Rotate right (Immediate)

roriw rd, rs1, shamt

Rotate right Word (Immediate)

rorw rd, rs1, rs2

Rotate right Word (Register)

bclr rd, rs1, rs2

Single-Bit Clear (Register)

bclri rd, rs1, imm

Single-Bit Clear (Immediate)

bext rd, rs1, rs2

Single-Bit Extract (Register)

bexti rd, rs1, imm

Single-Bit Extract (Immediate)

binv rd, rs1, rs2

Single-Bit Invert (Register)

binvi rd, rs1, imm

Single-Bit Invert (Immediate)

bset rd, rs1, rs2

Single-Bit Set (Register)

bseti rd, rs1, imm

Single-Bit Set (Immediate)

sext.b rd, rs

Sign-extend byte

sext.h rd, rs

Sign-extend halfword

sh1add rd, rs1, rs2

Shift left by 1 and add

sh1add.uw rd, rs1, rs2

Shift unsigned word left by 1 and add

sh2add rd, rs1, rs2

Shift left by 2 and add

sh2add.uw rd, rs1, rs2

Shift unsigned word left by 2 and add

sh3add rd, rs1, rs2

Shift left by 3 and add

sh3add.uw rd, rs1, rs2

Shift unsigned word left by 3 and add

slli.uw rd, rs1, imm

Shift-left unsigned word (Immediate)

xnor rd, rs1, rs2

Exclusive NOR

zext.h rd, rs

Zero-extend halfword

-
-

30.4.1. Zba: Address generation

-
-

The Zba instructions can be used to accelerate the generation of addresses that index into arrays of basic types (halfword, word, doubleword) using both unsigned word-sized and XLEN-sized indices: a shifted index is added to a base address.

-
-
-

The shift and add instructions do a left shift of 1, 2, or 3 because these are commonly found in real-world code and because they can be implemented with a minimal amount of additional hardware beyond that of the simple adder. This avoids lengthening the critical path in implementations.

-
-
-

While the shift and add instructions are limited to a maximum left shift of 3, the slli instruction (from the base ISA) can be used to perform similar shifts for indexing into arrays of wider elements. The slli.uw — added in this extension — can be used when the index is to be interpreted as an unsigned word.

-
-
-

The following instructions (and pseudoinstructions) comprise the Zba extension:

-
- ------ - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
RV32RV64MnemonicInstruction

add.uw rd, rs1, rs2

Add unsigned word

sh1add rd, rs1, rs2

Shift left by 1 and add

sh1add.uw rd, rs1, rs2

Shift unsigned word left by 1 and add

sh2add rd, rs1, rs2

Shift left by 2 and add

sh2add.uw rd, rs1, rs2

Shift unsigned word left by 2 and add

sh3add rd, rs1, rs2

Shift left by 3 and add

sh3add.uw rd, rs1, rs2

Shift unsigned word left by 3 and add

slli.uw rd, rs1, imm

Shift-left unsigned word (Immediate)

zext.w rd, rs

Add unsigned word

-
-
-

30.4.2. Zbb: Basic bit-manipulation

-
-
Logical with negate
- ------ - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
RV32RV64MnemonicInstruction

andn rd, rs1, rs2

AND with inverted operand

orn rd, rs1, rs2

OR with inverted operand

xnor rd, rs1, rs2

Exclusive NOR

-
- - - - - -
- - -
Implementation Hint
-
-

The Logical with Negate instructions can be implemented by inverting the rs2 inputs to the base-required AND, OR, and XOR logic instructions. -In some implementations, the inverter on rs2 used for subtraction can be reused for this purpose.

-
-
-
-
-
-
Count leading/trailing zero bits
- ------ - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
RV32RV64MnemonicInstruction

clz rd, rs

Count leading zero bits

clzw rd, rs

Count leading zero bits in word

ctz rd, rs

Count trailing zero bits

ctzw rd, rs

Count trailing zero bits in word

-
-
-
Count population
-
-

These instructions count the number of set bits (1-bits). This is also -commonly referred to as population count.

-
- ------ - - - - - - - - - - - - - - - - - - - - - - -
RV32RV64MnemonicInstruction

cpop rd, rs

Count set bits

cpopw rd, rs

Count set bits in word

-
-
-
Integer minimum/maximum
-
-

The integer minimum/maximum instructions are arithmetic R-type -instructions that return the smaller/larger of two operands.

-
- ------ - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
RV32RV64MnemonicInstruction

max rd, rs1, rs2

Maximum

maxu rd, rs1, rs2

Unsigned maximum

min rd, rs1, rs2

Minimum

minu rd, rs1, rs2

Unsigned minimum

-
-
-
Sign extension and zero extension
-
-

These instructions perform the sign extension or zero extension of the least significant 8 bits or 16 bits of the source register.

-
-
-

These instructions replace the generalized idioms slli rD,rS,(XLEN-<size>) + srli (for zero extension) or slli + srai (for sign extension) for the sign extension of 8-bit and 16-bit quantities, and for the zero extension of 16-bit quantities.

-
- ------ - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
RV32RV64MnemonicInstruction

sext.b rd, rs

Sign-extend byte

sext.h rd, rs

Sign-extend halfword

zext.h rd, rs

Zero-extend halfword

-
-
-
Bitwise rotation
-
-

Bitwise rotation instructions are similar to the shift-logical operations from the base spec. However, where the shift-logical -instructions shift in zeros, the rotate instructions shift in the bits that were shifted out of the other side of the value. -Such operations are also referred to as ‘circular shifts’.

-
- ------ - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
RV32RV64MnemonicInstruction

rol rd, rs1, rs2

Rotate left (Register)

rolw rd, rs1, rs2

Rotate Left Word (Register)

ror rd, rs1, rs2

Rotate right (Register)

rori rd, rs1, shamt

Rotate right (Immediate)

roriw rd, rs1, shamt

Rotate right Word (Immediate)

rorw rd, rs1, rs2

Rotate right Word (Register)

-
- - - - - -
- - -
Architecture Explanation
-
-

The rotate instructions were included to replace a common -four-instruction sequence to achieve the same effect (neg; sll/srl; srl/sll; or)

-
-
-
-
-
-
OR Combine
-
-

orc.b sets the bits of each byte in the result rd to all zeros if no bit within the respective byte of rs is set, or to all ones if any bit within the respective byte of rs is set.

-
-
-

One use-case is string-processing functions, such as strlen and strcpy, which can use orc.b to test for the terminating zero byte by counting the set bits in leading non-zero bytes in a word.

-
- ------ - - - - - - - - - - - - - - - - -
RV32RV64MnemonicInstruction

orc.b rd, rs

Bitwise OR-Combine, byte granule

-
-
-
Byte-reverse
-
-

rev8 reverses the byte-ordering of rs.

-
- ------ - - - - - - - - - - - - - - - - -
RV32RV64MnemonicInstruction

rev8 rd, rs

Byte-reverse register

-
-
-
-

30.4.3. Zbc: Carry-less multiplication

-
-

Carry-less multiplication is the multiplication in the polynomial ring over GF(2).

-
-
-

clmul produces the lower half of the carry-less product and clmulh produces the upper half of the 2✕XLEN carry-less product.

-
-
-

clmulr produces bits 2✕XLEN−2:XLEN-1 of the 2✕XLEN carry-less product.

-
- ------ - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
RV32RV64MnemonicInstruction

clmul rd, rs1, rs2

Carry-less multiply (low-part)

clmulh rd, rs1, rs2

Carry-less multiply (high-part)

clmulr rd, rs1, rs2

Carry-less multiply (reversed)

-
-
-

30.4.4. Zbs: Single-bit instructions

-
-

The single-bit instructions provide a mechanism to set, clear, invert, or extract -a single bit in a register. The bit is specified by its index.

-
- ------ - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
RV32RV64MnemonicInstruction

bclr rd, rs1, rs2

Single-Bit Clear (Register)

bclri rd, rs1, imm

Single-Bit Clear (Immediate)

bext rd, rs1, rs2

Single-Bit Extract (Register)

bexti rd, rs1, imm

Single-Bit Extract (Immediate)

binv rd, rs1, rs2

Single-Bit Invert (Register)

binvi rd, rs1, imm

Single-Bit Invert (Immediate)

bset rd, rs1, rs2

Single-Bit Set (Register)

bseti rd, rs1, imm

Single-Bit Set (Immediate)

-
-
-

30.4.5. Zbkb: Bit-manipulation for Cryptography

-
-

This extension contains instructions essential for implementing -common operations in cryptographic workloads.

-
- ------ - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
RV32RV64MnemonicInstruction

rol

Rotate left (Register)

rolw

Rotate Left Word (Register)

ror

Rotate right (Register)

rori

Rotate right (Immediate)

roriw

Rotate right Word (Immediate)

rorw

Rotate right Word (Register)

andn

AND with inverted operand

orn

OR with inverted operand

xnor

Exclusive NOR

pack

Pack low halves of registers

packh

Pack low bytes of registers

packw

Pack low 16-bits of registers (RV64)

rev.b

Reverse bits in bytes

rev8

Byte-reverse register

zip

Bit interleave

unzip

Bit deinterleave

-
-
-

30.4.6. Zbkc: Carry-less multiplication for Cryptography

-
-

Carry-less multiplication is the multiplication in the polynomial ring over -GF(2). This is a critical operation in some cryptographic workloads, -particularly the AES-GCM authenticated encryption scheme. -This extension provides only the instructions needed to -efficiently implement the GHASH operation, which is part of this workload.

-
- ------ - - - - - - - - - - - - - - - - - - - - - - -
RV32RV64MnemonicInstruction

clmul rd, rs1, rs2

Carry-less multiply (low-part)

clmulh rd, rs1, rs2

Carry-less multiply (high-part)

-
-
-

30.4.7. Zbkx: Crossbar permutations

-
-

These instructions implement a "lookup table" for 4 and 8 bit elements -inside the general purpose registers. -rs1 is used as a vector of N-bit words, and rs2 as a vector of N-bit -indices into rs1. -Each index in rs2 is replaced by the element of rs1 that it selects, or by zero -if the index is out of bounds.

-
-
-

These instructions are useful for expressing N-bit to N-bit boolean -operations, and implementing cryptographic code with secret -dependent memory accesses (particularly SBoxes) such that the execution -latency does not depend on the (secret) data being operated on.

-
- ------ - - - - - - - - - - - - - - - - - - - - - - -
RV32RV64MnemonicInstruction

xperm.n rd, rs1, rs2

Crossbar permutation (nibbles)

xperm.b rd, rs1, rs2

Crossbar permutation (bytes)

-
-
-
-
-

30.5. Instructions (in alphabetical order)

-
-

30.5.1. add.uw

-
-
-
Synopsis
-
-

Add unsigned word

-
-
Mnemonic
-
-

add.uw rd, rs1, rs2

-
-
Pseudoinstructions
-
-

zext.w rd, rs1 → add.uw rd, rs1, zero

-
-
Encoding
-
-
-
-
-Diagram -
-
-
-
-
Description
-
-

This instruction performs an XLEN-wide addition between rs2 and the zero-extended least-significant word of rs1.

-
-
Operation
-
-
-
-
-
let base = X(rs2);
-let index = EXTZ(X(rs1)[31..0]);
-
-X(rd) = base + index;
-
-
-
-
-
Included in
-
-
- ----- - - - - - - - - - - - - - - -
ExtensionMinimum versionLifecycle state

Zba (Address generation instructions)

0.93

Ratified

-
-
-
-

30.5.2. andn

-
-
-
Synopsis
-
-

AND with inverted operand

-
-
Mnemonic
-
-

andn rd, rs1, rs2

-
-
Encoding
-
-
-
-
-Diagram -
-
-
-
-
Description
-
-

This instruction performs the bitwise logical AND operation between rs1 and the bitwise inversion of rs2.

-
-
Operation
-
-
-
-
-
X(rd) = X(rs1) & ~X(rs2);
-
-
-
-
-
Included in
-
-
- ----- - - - - - - - - - - - - - - - - - - - -
ExtensionMinimum versionLifecycle state

Zbb (Basic bit-manipulation)

v1.0

Ratified

Zbkb (Bit-manipulation for Cryptography)

v1.0

Ratified

-
-
-
-

30.5.3. bclr

-
-
-
Synopsis
-
-

Single-Bit Clear (Register)

-
-
Mnemonic
-
-

bclr rd, rs1, rs2

-
-
Encoding
-
-
-
-
-Diagram -
-
-
-
-
Description
-
-

This instruction returns rs1 with a single bit cleared at the index specified in rs2. -The index is read from the lower log2(XLEN) bits of rs2.

-
-
Operation
-
-
-
-
-
let index = X(rs2) & (XLEN - 1);
-X(rd) = X(rs1) & ~(1 << index)
-
-
-
-
-
Included in
-
-
- ----- - - - - - - - - - - - - - - -
ExtensionMinimum versionLifecycle state

Zbs (Single-bit instructions)

v1.0

Ratified

-
-
-
-

30.5.4. bclri

-
-
-
Synopsis
-
-

Single-Bit Clear (Immediate)

-
-
Mnemonic
-
-

bclri rd, rs1, shamt

-
-
Encoding (RV32)
-
-
-
-
-Diagram -
-
-
-
-
Encoding (RV64)
-
-
-
-
-Diagram -
-
-
-
-
Description
-
-

This instruction returns rs1 with a single bit cleared at the index specified in shamt. -The index is read from the lower log2(XLEN) bits of shamt. -For RV32, the encodings corresponding to shamt[5]=1 are reserved.

-
-
Operation
-
-
-
-
-
let index = shamt & (XLEN - 1);
-X(rd) = X(rs1) & ~(1 << index)
-
-
-
-
-
Included in
-
-
- ----- - - - - - - - - - - - - - - -
ExtensionMinimum versionLifecycle state

Zbs (Single-bit instructions)

v1.0

Ratified

-
-
-
-

30.5.5. bext

-
-
-
Synopsis
-
-

Single-Bit Extract (Register)

-
-
Mnemonic
-
-

bext rd, rs1, rs2

-
-
Encoding
-
-
-
-
-Diagram -
-
-
-
-
Description
-
-

This instruction returns a single bit extracted from rs1 at the index specified in rs2. -The index is read from the lower log2(XLEN) bits of rs2.

-
-
Operation
-
-
-
-
-
let index = X(rs2) & (XLEN - 1);
-X(rd) = (X(rs1) >> index) & 1;
-
-
-
-
-
Included in
-
-
- ----- - - - - - - - - - - - - - - -
ExtensionMinimum versionLifecycle state

Zbs (Single-bit instructions)

v1.0

Ratified

-
-
-
-

30.5.6. bexti

-
-
-
Synopsis
-
-

Single-Bit Extract (Immediate)

-
-
Mnemonic
-
-

bexti rd, rs1, shamt

-
-
Encoding (RV32)
-
-
-
-
-Diagram -
-
-
-
-
Encoding (RV64)
-
-
-
-
-Diagram -
-
-
-
-
Description
-
-

This instruction returns a single bit extracted from rs1 at the index specified in shamt. -The index is read from the lower log2(XLEN) bits of shamt. -For RV32, the encodings corresponding to shamt[5]=1 are reserved.

-
-
Operation
-
-
-
-
-
let index = shamt & (XLEN - 1);
-X(rd) = (X(rs1) >> index) & 1;
-
-
-
-
-
Included in
-
-
- ----- - - - - - - - - - - - - - - -
ExtensionMinimum versionLifecycle state

Zbs (Single-bit instructions)

v1.0

Ratified

-
-
-
-

30.5.7. binv

-
-
-
Synopsis
-
-

Single-Bit Invert (Register)

-
-
Mnemonic
-
-

binv rd, rs1, rs2

-
-
Encoding
-
-
-
-
-Diagram -
-
-
-
-
Description
-
-

This instruction returns rs1 with a single bit inverted at the index specified in rs2. -The index is read from the lower log2(XLEN) bits of rs2.

-
-
Operation
-
-
-
-
-
let index = X(rs2) & (XLEN - 1);
-X(rd) = X(rs1) ^ (1 << index)
-
-
-
-
-
Included in
-
-
- ----- - - - - - - - - - - - - - - -
ExtensionMinimum versionLifecycle state

Zbs (Single-bit instructions)

v1.0

Ratified

-
-
-
-

30.5.8. binvi

-
-
-
Synopsis
-
-

Single-Bit Invert (Immediate)

-
-
Mnemonic
-
-

binvi rd, rs1, shamt

-
-
Encoding (RV32)
-
-
-
-
-Diagram -
-
-
-
-
Encoding (RV64)
-
-
-
-
-Diagram -
-
-
-
-
Description
-
-

This instruction returns rs1 with a single bit inverted at the index specified in shamt. -The index is read from the lower log2(XLEN) bits of shamt. -For RV32, the encodings corresponding to shamt[5]=1 are reserved.

-
-
Operation
-
-
-
-
-
let index = shamt & (XLEN - 1);
-X(rd) = X(rs1) ^ (1 << index)
-
-
-
-
-
Included in
-
-
- ----- - - - - - - - - - - - - - - -
ExtensionMinimum versionLifecycle state

Zbs (Single-bit instructions)

v1.0

Ratified

-
-
-
-

30.5.9. bset

-
-
-
Synopsis
-
-

Single-Bit Set (Register)

-
-
Mnemonic
-
-

bset rd, rs1, rs2

-
-
Encoding
-
-
-
-
-Diagram -
-
-
-
-
Description
-
-

This instruction returns rs1 with a single bit set at the index specified in rs2. -The index is read from the lower log2(XLEN) bits of rs2.

-
-
Operation
-
-
-
-
-
let index = X(rs2) & (XLEN - 1);
-X(rd) = X(rs1) | (1 << index)
-
-
-
-
-
Included in
-
-
- ----- - - - - - - - - - - - - - - -
ExtensionMinimum versionLifecycle state

Zbs (Single-bit instructions)

v1.0

Ratified

-
-
-
-

30.5.10. bseti

-
-
-
Synopsis
-
-

Single-Bit Set (Immediate)

-
-
Mnemonic
-
-

bseti rd, rs1, shamt

-
-
Encoding (RV32)
-
-
-
-
-Diagram -
-
-
-
-
Encoding (RV64)
-
-
-
-
-Diagram -
-
-
-
-
Description
-
-

This instruction returns rs1 with a single bit set at the index specified in shamt. -The index is read from the lower log2(XLEN) bits of shamt. -For RV32, the encodings corresponding to shamt[5]=1 are reserved.

-
-
Operation
-
-
-
-
-
let index = shamt & (XLEN - 1);
-X(rd) = X(rs1) | (1 << index)
-
-
-
-
-
Included in
-
-
- ----- - - - - - - - - - - - - - - -
ExtensionMinimum versionLifecycle state

Zbs (Single-bit instructions)

v1.0

Ratified

-
-
-
-

30.5.11. clmul

-
-
-
Synopsis
-
-

Carry-less multiply (low-part)

-
-
Mnemonic
-
-

clmul rd, rs1, rs2

-
-
Encoding
-
-
-
-
-Diagram -
-
-
-
-
Description
-
-

clmul produces the lower half of the 2·XLEN carry-less product.

-
-
Operation
-
-
-
-
-
let rs1_val = X(rs1);
-let rs2_val = X(rs2);
-let output : xlenbits = 0;
-
-foreach (i from 0 to (xlen - 1) by 1) {
-   output = if   ((rs2_val >> i) & 1)
-            then output ^ (rs1_val << i);
-            else output;
-}
-
-X[rd] = output
-
-
-
-
-
Included in
-
-
- ----- - - - - - - - - - - - - - - - - - - - -
ExtensionMinimum versionLifecycle state

Zbc (Carry-less multiplication)

v1.0

Ratified

Zbkc (Carry-less multiplication for Cryptography)

v1.0

Ratified

-
-
-
-

30.5.12. clmulh

-
-
-
Synopsis
-
-

Carry-less multiply (high-part)

-
-
Mnemonic
-
-

clmulh rd, rs1, rs2

-
-
Encoding
-
-
-
-
-Diagram -
-
-
-
-
Description
-
-

clmulh produces the upper half of the 2·XLEN carry-less product.

-
-
Operation
-
-
-
-
-
let rs1_val = X(rs1);
-let rs2_val = X(rs2);
-let output : xlenbits = 0;
-
-foreach (i from 1 to xlen by 1) {
-   output = if   ((rs2_val >> i) & 1)
-            then output ^ (rs1_val >> (xlen - i));
-            else output;
-}
-
-X[rd] = output
-
-
-
-
-
Included in
-
-
- ----- - - - - - - - - - - - - - - - - - - - -
ExtensionMinimum versionLifecycle state

Zbc (Carry-less multiplication)

v1.0

Ratified

Zbkc (Carry-less multiplication for Cryptography)

v1.0

Ratified

-
-
-
-

30.5.13. clmulr

-
-
-
Synopsis
-
-

Carry-less multiply (reversed)

-
-
Mnemonic
-
-

clmulr rd, rs1, rs2

-
-
Encoding
-
-
-
-
-Diagram -
-
-
-
-
Description
-
-

clmulr produces bits 2·XLEN−2:XLEN-1 of the 2·XLEN carry-less -product.

-
-
Operation
-
-
-
-
-
let rs1_val = X(rs1);
-let rs2_val = X(rs2);
-let output : xlenbits = 0;
-
-foreach (i from 0 to (xlen - 1) by 1) {
-   output = if   ((rs2_val >> i) & 1)
-            then output ^ (rs1_val >> (xlen - i - 1));
-            else output;
-}
-
-X[rd] = output
-
-
-
- - - - - -
- - -
Note
-
-

The clmulr instruction is used to accelerate CRC calculations. -The r in the instruction’s mnemonic stands for reversed, as the -instruction is equivalent to bit-reversing the inputs, performing -a clmul, then bit-reversing the output.

-
-
-
-
-
-
Included in
-
-
- ----- - - - - - - - - - - - - - - -
ExtensionMinimum versionLifecycle state

Zbc (Carry-less multiplication)

v1.0

Ratified

-
-
-
-

30.5.14. clz

-
-
-
Synopsis
-
-

Count leading zero bits

-
-
Mnemonic
-
-

clz rd, rs

-
-
Encoding
-
-
-
-
-Diagram -
-
-
-
-
Description
-
-

This instruction counts the number of 0’s before the first 1, starting at the most-significant bit (i.e., XLEN-1) and progressing to bit 0. Accordingly, if the input is 0, the output is XLEN, and if the most-significant bit of the input is a 1, the output is 0.

-
-
Operation
-
-
-
-
-
val HighestSetBit : forall ('N : Int), 'N >= 0. bits('N) -> int
-
-function HighestSetBit x = {
-  foreach (i from (xlen - 1) to 0 by 1 in dec)
-    if [x[i]] == 0b1 then return(i) else ();
-  return -1;
-}
-
-let rs = X(rs);
-X[rd] = (xlen - 1) - HighestSetBit(rs);
-
-
-
-
-
Included in
-
-
- ----- - - - - - - - - - - - - - - -
ExtensionMinimum versionLifecycle state

Zbb (Basic bit-manipulation)

v1.0

Ratified

-
-
-
-

30.5.15. clzw

-
-
-
Synopsis
-
-

Count leading zero bits in word

-
-
Mnemonic
-
-

clzw rd, rs

-
-
Encoding
-
-
-
-
-Diagram -
-
-
-
-
Description
-
-

This instruction counts the number of 0’s before the first 1 starting at bit 31 and progressing to bit 0. -Accordingly, if the least-significant word is 0, the output is 32, and if the most-significant bit of the word (i.e., bit 31) is a 1, the output is 0.

-
-
Operation
-
-
-
-
-
val HighestSetBit32 : forall ('N : Int), 'N >= 0. bits('N) -> int
-
-function HighestSetBit32 x = {
-  foreach (i from 31 to 0 by 1 in dec)
-    if [x[i]] == 0b1 then return(i) else ();
-  return -1;
-}
-
-let rs = X(rs);
-X[rd] = 31 - HighestSetBit32(rs);
-
-
-
-
-
Included in
-
-
- ----- - - - - - - - - - - - - - - -
ExtensionMinimum versionLifecycle state

Zbb (Basic bit-manipulation)

v1.0

Ratified

-
-
-
-

30.5.16. cpop

-
-
-
Synopsis
-
-

Count set bits

-
-
Mnemonic
-
-

cpop rd, rs

-
-
Encoding
-
-
-
-
-Diagram -
-
-
-
-
Description
-
-

This instruction counts the number of 1’s (i.e., set bits) in the source register.

-
-
Operation
-
-
-
-
-
let bitcount = 0;
-let rs = X(rs);
-
-foreach (i from 0 to (xlen - 1) in inc)
-    if rs[i] == 0b1 then bitcount = bitcount + 1 else ();
-
-X[rd] = bitcount
-
-
-
- - - - - -
- - -
Software Hint
-
-

This operation is known as population count, popcount, sideways sum, bit summation, or Hamming weight.

-
-
-

The GCC builtin function __builtin_popcount (unsigned int x) is implemented by cpop on RV32 and by cpopw on RV64. -The GCC builtin function __builtin_popcountl (unsigned long x) for LP64 is implemented by cpop on RV64.

-
-
-
-
-
-
Included in
-
-
- ----- - - - - - - - - - - - - - - -
ExtensionMinimum versionLifecycle state

Zbb (Basic bit-manipulation)

v1.0

Ratified

-
-
-
-

30.5.17. cpopw

-
-
-
Synopsis
-
-

Count set bits in word

-
-
Mnemonic
-
-

cpopw rd, rs

-
-
Encoding
-
-
-
-
-Diagram -
-
-
-
-
Description
-
-

This instruction counts the number of 1’s (i.e., set bits) in the least-significant word of the source register.

-
-
Operation
-
-
-
-
-
let bitcount = 0;
-let val = X(rs);
-
-foreach (i from 0 to 31 in inc)
-    if val[i] == 0b1 then bitcount = bitcount + 1 else ();
-
-X[rd] = bitcount
-
-
-
-
-
Included in
-
-
- ----- - - - - - - - - - - - - - - -
ExtensionMinimum versionLifecycle state

Zbb (Basic bit-manipulation)

v1.0

Ratified

-
-
-
-

30.5.18. ctz

-
-
-
Synopsis
-
-

Count trailing zero bits

-
-
Mnemonic
-
-

ctz rd, rs

-
-
Encoding
-
-
-
-
-Diagram -
-
-
-
-
Description
-
-

This instruction counts the number of 0’s before the first 1, starting at the least-significant bit (i.e., 0) and progressing to the most-significant bit (i.e., XLEN-1). -Accordingly, if the input is 0, the output is XLEN, and if the least-significant bit of the input is a 1, the output is 0.

-
-
Operation
-
-
-
-
-
val LowestSetBit : forall ('N : Int), 'N >= 0. bits('N) -> int
-
-function LowestSetBit x = {
-  foreach (i from 0 to (xlen - 1) by 1 in inc)
-    if [x[i]] == 0b1 then return(i) else ();
-  return xlen;
-}
-
-let rs = X(rs);
-X[rd] = LowestSetBit(rs);
-
-
-
-
-
Included in
-
-
- ----- - - - - - - - - - - - - - - -
ExtensionMinimum versionLifecycle state

Zbb (Basic bit-manipulation)

v1.0

Ratified

-
-
-
-

30.5.19. ctzw

-
-
-
Synopsis
-
-

Count trailing zero bits in word

-
-
Mnemonic
-
-

ctzw rd, rs

-
-
Encoding
-
-
-
-
-Diagram -
-
-
-
-
Description
-
-

This instruction counts the number of 0’s before the first 1, starting at the least-significant bit (i.e., 0) and progressing to the most-significant bit of the least-significant word (i.e., 31). Accordingly, if the least-significant word is 0, the output is 32, and if the least-significant bit of the input is a 1, the output is 0.

-
-
Operation
-
-
-
-
-
val LowestSetBit32 : forall ('N : Int), 'N >= 0. bits('N) -> int
-
-function LowestSetBit32 x = {
-  foreach (i from 0 to 31 by 1 in inc)
-    if [x[i]] == 0b1 then return(i) else ();
-  return 32;
-}
-
-let rs = X(rs);
-X[rd] = LowestSetBit32(rs);
-
-
-
-
-
Included in
-
-
- ----- - - - - - - - - - - - - - - -
ExtensionMinimum versionLifecycle state

Zbb (Basic bit-manipulation)

v1.0

Ratified

-
-
-
-

30.5.20. max

-
-
-
Synopsis
-
-

Maximum

-
-
Mnemonic
-
-

max rd, rs1, rs2

-
-
Encoding
-
-
-
-
-Diagram -
-
-
-
-
Description
-
-

This instruction returns the larger of two signed integers.

-
-
Operation
-
-
-
-
-
let rs1_val = X(rs1);
-let rs2_val = X(rs2);
-
-let result = if   rs1_val <_s rs2_val
-             then rs2_val
-             else rs1_val;
-
-X(rd) = result;
-
-
-
- - - - - -
- - -
Software Hint
-
-

Calculating the absolute value of a signed integer can be performed -using the following sequence: neg rD,rS followed by max -rD,rS,rD. When using this common sequence, it is suggested that they -are scheduled with no intervening instructions so that -implementations that are so optimized can fuse them together.

-
-
-
-
-
-
Included in
-
-
- ----- - - - - - - - - - - - - - - -
ExtensionMinimum versionLifecycle state

Zbb (Basic bit-manipulation)

v1.0

Ratified

-
-
-
-

30.5.21. maxu

-
-
-
Synopsis
-
-

Unsigned maximum

-
-
Mnemonic
-
-

maxu rd, rs1, rs2

-
-
Encoding
-
-
-
-
-Diagram -
-
-
-
-
Description
-
-

This instruction returns the larger of two unsigned integers.

-
-
Operation
-
-
-
-
-
let rs1_val = X(rs1);
-let rs2_val = X(rs2);
-
-let result = if   rs1_val <_u rs2_val
-             then rs2_val
-             else rs1_val;
-
-X(rd) = result;
-
-
-
-
-
Included in
-
-
- ----- - - - - - - - - - - - - - - -
ExtensionMinimum versionLifecycle state

Zbb (Basic bit-manipulation)

v1.0

Ratified

-
-
-
-

30.5.22. min

-
-
-
Synopsis
-
-

Minimum

-
-
Mnemonic
-
-

min rd, rs1, rs2

-
-
Encoding
-
-
-
-
-Diagram -
-
-
-
-
Description
-
-

This instruction returns the smaller of two signed integers.

-
-
Operation
-
-
-
-
-
let rs1_val = X(rs1);
-let rs2_val = X(rs2);
-
-let result = if   rs1_val <_s rs2_val
-             then rs1_val
-             else rs2_val;
-
-X(rd) = result;
-
-
-
-
-
Included in
-
-
- ----- - - - - - - - - - - - - - - -
ExtensionMinimum versionLifecycle state

Zbb (Basic bit-manipulation)

v1.0

Ratified

-
-
-
-

30.5.23. minu

-
-
-
Synopsis
-
-

Unsigned minimum

-
-
Mnemonic
-
-

minu rd, rs1, rs2

-
-
Encoding
-
-
-
-
-Diagram -
-
-
-
-
Description
-
-

This instruction returns the smaller of two unsigned integers.

-
-
Operation
-
-
-
-
-
let rs1_val = X(rs1);
-let rs2_val = X(rs2);
-
-let result = if   rs1_val <_u rs2_val
-             then rs1_val
-             else rs2_val;
-
-X(rd) = result;
-
-
-
-
-
Included in
-
-
- ----- - - - - - - - - - - - - - - -
ExtensionMinimum versionLifecycle state

Zbb (Basic bit-manipulation)

v1.0

Ratified

-
-
-
-

30.5.24. orc.b

-
-
-
Synopsis
-
-

Bitwise OR-Combine, byte granule

-
-
Mnemonic
-
-

orc.b rd, rs

-
-
Encoding
-
-
-
-
-Diagram -
-
-
-
-
Description
-
-

Combines the bits within each byte using bitwise logical OR. -This sets the bits of each byte in the result rd to all zeros if no bit within the respective byte of rs is set, or to all ones if any bit within the respective byte of rs is set.

-
-
Operation
-
-
-
-
-
let input = X(rs);
-let output : xlenbits = 0;
-
-foreach (i from 0 to (xlen - 8) by 8) {
-   output[(i + 7)..i] = if   input[(i + 7)..i] == 0
-                        then 0b00000000
-                        else 0b11111111;
-}
-
-X[rd] = output;
-
-
-
-
-
Included in
-
-
- ----- - - - - - - - - - - - - - - -
ExtensionMinimum versionLifecycle state

Zbb (Basic bit-manipulation)

v1.0

Ratified

-
-
-
-

30.5.25. orn

-
-
-
Synopsis
-
-

OR with inverted operand

-
-
Mnemonic
-
-

orn rd, rs1, rs2

-
-
Encoding
-
-
-
-
-Diagram -
-
-
-
-
Description
-
-

This instruction performs the bitwise logical OR operation between rs1 and the bitwise inversion of rs2.

-
-
Operation
-
-
-
-
-
X(rd) = X(rs1) | ~X(rs2);
-
-
-
-
-
Included in
-
-
- ----- - - - - - - - - - - - - - - - - - - - -
ExtensionMinimum versionLifecycle state

Zbb (Basic bit-manipulation)

v1.0

Ratified

Zbkb (Bit-manipulation for Cryptography)

v1.0

Ratified

-
-
-
-

30.5.26. pack

-
-
-
Synopsis
-
-

Pack the low halves of rs1 and rs2 into rd.

-
-
Mnemonic
-
-

pack rd, rs1, rs2

-
-
Encoding
-
-
-
-
-Diagram -
-
-
-
-
Description
-
-

The pack instruction packs the XLEN/2-bit lower halves of rs1 and rs2 into -rd, with rs1 in the lower half and rs2 in the upper half.

-
-
Operation
-
-
-
-
-
let lo_half : bits(xlen/2) = X(rs1)[xlen/2-1..0];
-let hi_half : bits(xlen/2) = X(rs2)[xlen/2-1..0];
-X(rd) = EXTZ(hi_half @ lo_half);
-
-
-
-
-
Included in
-
-
- ----- - - - - - - - - - - - - - - -
ExtensionMinimum versionLifecycle state

Zbkb (Bit-manipulation for Cryptography)

v1.0

Ratified

-
-
-
-

30.5.27. packh

-
-
-
Synopsis
-
-

Pack the low bytes of rs1 and rs2 into rd.

-
-
Mnemonic
-
-

packh rd, rs1, rs2

-
-
Encoding
-
-
-
-
-Diagram -
-
-
-
-
Description
-
-

The packh instruction packs the least-significant bytes of -rs1 and rs2 into the 16 least-significant bits of rd, -zero extending the rest of rd.

-
-
Operation
-
-
-
-
-
let lo_half : bits(8) = X(rs1)[7..0];
-let hi_half : bits(8) = X(rs2)[7..0];
-X(rd) = EXTZ(hi_half @ lo_half);
-
-
-
-
-
Included in
-
-
- ----- - - - - - - - - - - - - - - -
ExtensionMinimum versionLifecycle state

Zbkb (Bit-manipulation for Cryptography)

v1.0

Ratified

-
-
-
-

30.5.28. packw

-
-
-
Synopsis
-
-

Pack the low 16-bits of rs1 and rs2 into rd on RV64.

-
-
Mnemonic
-
-

packw rd, rs1, rs2

-
-
Encoding
-
-
-
-
-Diagram -
-
-
-
-
Description
-
-

This instruction packs the low 16 bits of -rs1 and rs2 into the 32 least-significant bits of rd, -sign extending the 32-bit result to the rest of rd. -This instruction only exists on RV64 based systems.

-
-
Operation
-
-
-
-
-
let lo_half : bits(16) = X(rs1)[15..0];
-let hi_half : bits(16) = X(rs2)[15..0];
-X(rd) = EXTS(hi_half @ lo_half);
-
-
-
-
-
Included in
-
-
- ----- - - - - - - - - - - - - - - -
ExtensionMinimum versionLifecycle state

Zbkb (Bit-manipulation for Cryptography)

v1.0

Ratified

-
-
-
-

30.5.29. rev8

-
-
-
Synopsis
-
-

Byte-reverse register

-
-
Mnemonic
-
-

rev8 rd, rs

-
-
Encoding (RV32)
-
-
-
-
-Diagram -
-
-
-
-
Encoding (RV64)
-
-
-
-
-Diagram -
-
-
-
-
Description
-
-

This instruction reverses the order of the bytes in rs.

-
-
Operation
-
-
-
-
-
let input = X(rs);
-let output : xlenbits = 0;
-let j = xlen - 1;
-
-foreach (i from 0 to (xlen - 8) by 8) {
-   output[i..(i + 7)] = input[(j - 7)..j];
-   j = j - 8;
-}
-
-X[rd] = output
-
-
-
- - - - - -
- - -
Note
-
-

The rev8 mnemonic corresponds to different instruction encodings in RV32 and RV64.

-
-
-
-
- - - - - -
- - -
Software Hint
-
-

The byte-reverse operation is only available for the full register -width. To emulate word-sized and halfword-sized byte-reversal, -perform a rev8 rd,rs followed by a srai rd,rd,K, where K is -XLEN-32 and XLEN-16, respectively.

-
-
-
-
-
-
Included in
-
-
- ----- - - - - - - - - - - - - - - - - - - - -
ExtensionMinimum versionLifecycle state

Zbb (Basic bit-manipulation)

v1.0

Ratified

Zbkb (Bit-manipulation for Cryptography)

v1.0

Ratified

-
-
-
-

30.5.30. rev.b

-
-
-
Synopsis
-
-

Reverse the bits in each byte of a source register.

-
-
Mnemonic
-
-

rev.b rd, rs

-
-
Encoding
-
-
-
-
-Diagram -
-
-
-
-
Description
-
-

This instruction reverses the order of the bits in every byte of a register.

-
-
Operation
-
-
-
-
-
result : xlenbits = EXTZ(0b0);
-foreach (i from 0 to sizeof(xlen) by 8) {
-    result[i+7..i] = reverse_bits_in_byte(X(rs1)[i+7..i]);
-};
-X(rd) = result;
-
-
-
-
-
Included in
-
-
- ----- - - - - - - - - - - - - - - -
ExtensionMinimum versionLifecycle state

Zbkb (Bit-manipulation for Cryptography)

v1.0

Ratified

-
-
-
-

30.5.31. rol

-
-
-
Synopsis
-
-

Rotate Left (Register)

-
-
Mnemonic
-
-

rol rd, rs1, rs2

-
-
Encoding
-
-
-
-
-Diagram -
-
-
-
-
Description
-
-

This instruction performs a rotate left of rs1 by the amount in least-significant log2(XLEN) bits of rs2.

-
-
Operation
-
-
-
-
-
let shamt = if   xlen == 32
-            then X(rs2)[4..0]
-            else X(rs2)[5..0];
-let result = (X(rs1) << shamt) | (X(rs1) >> (xlen - shamt));
-
-X(rd) = result;
-
-
-
-
-
Included in
-
-
- ----- - - - - - - - - - - - - - - - - - - - -
ExtensionMinimum versionLifecycle state

Zbb (Basic bit-manipulation)

0.93

Ratified

Zbkb (Bit-manipulation for Cryptography)

v1.0

Ratified

-
-
-
-

30.5.32. rolw

-
-
-
Synopsis
-
-

Rotate Left Word (Register)

-
-
Mnemonic
-
-

rolw rd, rs1, rs2

-
-
Encoding
-
-
-
-
-Diagram -
-
-
-
-
Description
-
-

This instruction performs a rotate left on the least-significant word of rs1 by the amount in least-significant 5 bits of rs2. -The resulting word value is sign-extended by copying bit 31 to all of the more-significant bits.

-
-
Operation
-
-
-
-
-
let rs1 = EXTZ(X(rs1)[31..0]);
-let shamt = X(rs2)[4..0];
-let result = (rs1 << shamt) | (rs1 >> (32 - shamt));
-X(rd) = EXTS(result[31..0]);
-
-
-
-
-
Included in
-
-
- ----- - - - - - - - - - - - - - - - - - - - -
ExtensionMinimum versionLifecycle state

Zbb (Basic bit-manipulation)

0.93

Ratified

Zbkb (Bit-manipulation for Cryptography)

v1.0

Ratified

-
-
-
-

30.5.33. ror

-
-
-
Synopsis
-
-

Rotate Right (Register)

-
-
Mnemonic
-
-

ror rd, rs1, rs2

-
-
Encoding
-
-
-
-
-Diagram -
-
-
-
-
Description
-
-

This instruction performs a rotate right of rs1 by the amount in least-significant log2(XLEN) bits of rs2.

-
-
Operation
-
-
-
-
-
let shamt = if   xlen == 32
-            then X(rs2)[4..0]
-            else X(rs2)[5..0];
-let result = (X(rs1) >> shamt) | (X(rs1) << (xlen - shamt));
-
-X(rd) = result;
-
-
-
-
-
Included in
-
-
- ----- - - - - - - - - - - - - - - - - - - - -
ExtensionMinimum versionLifecycle state

Zbb (Basic bit-manipulation)

0.93

Ratified

Zbkb (Bit-manipulation for Cryptography)

v1.0

Ratified

-
-
-
-

30.5.34. rori

-
-
-
Synopsis
-
-

Rotate Right (Immediate)

-
-
Mnemonic
-
-

rori rd, rs1, shamt

-
-
Encoding (RV32)
-
-
-
-
-Diagram -
-
-
-
-
Encoding (RV64)
-
-
-
-
-Diagram -
-
-
-
-
Description
-
-

This instruction performs a rotate right of rs1 by the amount in the least-significant log2(XLEN) bits of shamt. -For RV32, the encodings corresponding to shamt[5]=1 are reserved.

-
-
Operation
-
-
-
-
-
let shamt = if   xlen == 32
-            then shamt[4..0]
-            else shamt[5..0];
-let result = (X(rs1) >> shamt) | (X(rs1) << (xlen - shamt));
-
-X(rd) = result;
-
-
-
-
-
Included in
-
-
- ----- - - - - - - - - - - - - - - - - - - - -
ExtensionMinimum versionLifecycle state

Zbb (Basic bit-manipulation)

0.93

Ratified

Zbkb (Bit-manipulation for Cryptography)

v1.0

Ratified

-
-
-
-

30.5.35. roriw

-
-
-
Synopsis
-
-

Rotate Right Word by Immediate

-
-
Mnemonic
-
-

roriw rd, rs1, shamt

-
-
Encoding
-
-
-
-
-Diagram -
-
-
-
-
Description
-
-

This instruction performs a rotate right on the least-significant word -of rs1 by the amount in the least-significant log2(XLEN) bits of -shamt. -The resulting word value is sign-extended by copying bit 31 to all of -the more-significant bits.

-
-
Operation
-
-
-
-
-
let rs1_data = EXTZ(X(rs1)[31..0]);
-let result = (rs1_data >> shamt) | (rs1_data << (32 - shamt));
-X(rd) = EXTS(result[31..0]);
-
-
-
-
-
Included in
-
-
- ----- - - - - - - - - - - - - - - - - - - - -
ExtensionMinimum versionLifecycle state

Zbb (Basic bit-manipulation)

0.93

Ratified

Zbkb (Bit-manipulation for Cryptography)

v1.0

Ratified

-
-
-
-

30.5.36. rorw

-
-
-
Synopsis
-
-

Rotate Right Word (Register)

-
-
Mnemonic
-
-

rorw rd, rs1, rs2

-
-
Encoding
-
-
-
-
-Diagram -
-
-
-
-
Description
-
-

This instruction performs a rotate right on the least-significant word of rs1 by the amount in the least-significant 5 bits of rs2. -The resultant word is sign-extended by copying bit 31 to all of the more-significant bits.

-
-
Operation
-
-
-
-
-
let rs1 = EXTZ(X(rs1)[31..0]);
-let shamt = X(rs2)[4..0];
-let result = (rs1 >> shamt) | (rs1 << (32 - shamt));
-X(rd) = EXTS(result);
-
-
-
-
-
Included in
-
-
- ----- - - - - - - - - - - - - - - - - - - - -
ExtensionMinimum versionLifecycle state

Zbb (Basic bit-manipulation)

0.93

Ratified

Zbkb (Bit-manipulation for Cryptography)

v1.0

Ratified

-
-
-
-

30.5.37. sext.b

-
-
-
Synopsis
-
-

Sign-extend byte

-
-
Mnemonic
-
-

sext.b rd, rs

-
-
Encoding
-
-
-
-
-Diagram -
-
-
-
-
Description
-
-

This instruction sign-extends the least-significant byte in the source to XLEN by copying the most-significant bit in the byte (i.e., bit 7) to all of the more-significant bits.

-
-
Operation
-
-
-
-
-
X(rd) = EXTS(X(rs)[7..0]);
-
-
-
-
-
Included in
-
-
- ----- - - - - - - - - - - - - - - -
ExtensionMinimum versionLifecycle state

Zbb (Basic bit-manipulation)

0.93

Ratified

-
-
-
-

30.5.38. sext.h

-
-
-
Synopsis
-
-

Sign-extend halfword

-
-
Mnemonic
-
-

sext.h rd, rs

-
-
Encoding
-
-
-
-
-Diagram -
-
-
-
-
Description
-
-

This instruction sign-extends the least-significant halfword in rs to XLEN by copying the most-significant bit in the halfword (i.e., bit 15) to all of the more-significant bits.

-
-
Operation
-
-
-
-
-
X(rd) = EXTS(X(rs)[15..0]);
-
-
-
-
-
Included in
-
-
- ----- - - - - - - - - - - - - - - -
ExtensionMinimum versionLifecycle state

Zbb (Basic bit-manipulation)

0.93

Ratified

-
-
-
-

30.5.39. sh1add

-
-
-
Synopsis
-
-

Shift left by 1 and add

-
-
Mnemonic
-
-

sh1add rd, rs1, rs2

-
-
Encoding
-
-
-
-
-Diagram -
-
-
-
-
Description
-
-

This instruction shifts rs1 to the left by 1 bit and adds it to rs2.

-
-
Operation
-
-
-
-
-
X(rd) = X(rs2) + (X(rs1) << 1);
-
-
-
-
-
Included in
-
-
- ----- - - - - - - - - - - - - - - -
ExtensionMinimum versionLifecycle state

Zba (Address generation instructions)

0.93

Ratified

-
-
-
-

30.5.40. sh1add.uw

-
-
-
Synopsis
-
-

Shift unsigned word left by 1 and add

-
-
Mnemonic
-
-

sh1add.uw rd, rs1, rs2

-
-
Encoding
-
-
-
-
-Diagram -
-
-
-
-
Description
-
-

This instruction performs an XLEN-wide addition of two addends. -The first addend is rs2. The second addend is the unsigned value formed by extracting the least-significant word of rs1 and shifting it left by 1 place.

-
-
Operation
-
-
-
-
-
let base = X(rs2);
-let index = EXTZ(X(rs1)[31..0]);
-
-X(rd) = base + (index << 1);
-
-
-
-
-
Included in
-
-
- ----- - - - - - - - - - - - - - - -
ExtensionMinimum versionLifecycle state

Zba (Address generation instructions)

0.93

Ratified

-
-
-
-

30.5.41. sh2add

-
-
-
Synopsis
-
-

Shift left by 2 and add

-
-
Mnemonic
-
-

sh2add rd, rs1, rs2

-
-
Encoding
-
-
-
-
-Diagram -
-
-
-
-
Description
-
-

This instruction shifts rs1 to the left by 2 places and adds it to rs2.

-
-
Operation
-
-
-
-
-
X(rd) = X(rs2) + (X(rs1) << 2);
-
-
-
-
-
Included in
-
-
- ----- - - - - - - - - - - - - - - -
ExtensionMinimum versionLifecycle state

Zba (Address generation instructions)

0.93

Ratified

-
-
-
-

30.5.42. sh2add.uw

-
-
-
Synopsis
-
-

Shift unsigned word left by 2 and add

-
-
Mnemonic
-
-

sh2add.uw rd, rs1, rs2

-
-
Encoding
-
-
-
-
-Diagram -
-
-
-
-
Description
-
-

This instruction performs an XLEN-wide addition of two addends. -The first addend is rs2. -The second addend is the unsigned value formed by extracting the least-significant word of rs1 and shifting it left by 2 places.

-
-
Operation
-
-
-
-
-
let base = X(rs2);
-let index = EXTZ(X(rs1)[31..0]);
-
-X(rd) = base + (index << 2);
-
-
-
-
-
Included in
-
-
- ----- - - - - - - - - - - - - - - -
ExtensionMinimum versionLifecycle state

Zba (Address generation instructions)

0.93

Ratified

-
-
-
-

30.5.43. sh3add

-
-
-
Synopsis
-
-

Shift left by 3 and add

-
-
Mnemonic
-
-

sh3add rd, rs1, rs2

-
-
Encoding
-
-
-
-
-Diagram -
-
-
-
-
Description
-
-

This instruction shifts rs1 to the left by 3 places and adds it to rs2.

-
-
Operation
-
-
-
-
-
X(rd) = X(rs2) + (X(rs1) << 3);
-
-
-
-
-
Included in
-
-
- ----- - - - - - - - - - - - - - - -
ExtensionMinimum versionLifecycle state

Zba (Address generation instructions)

0.93

Ratified

-
-
-
-

30.5.44. sh3add.uw

-
-
-
Synopsis
-
-

Shift unsigned word left by 3 and add

-
-
Mnemonic
-
-

sh3add.uw rd, rs1, rs2

-
-
Encoding
-
-
-
-
-Diagram -
-
-
-
-
Description
-
-

This instruction performs an XLEN-wide addition of two addends. The first addend is rs2. The second addend is the unsigned value formed by extracting the least-significant word of rs1 and shifting it left by 3 places.

-
-
Operation
-
-
-
-
-
let base = X(rs2);
-let index = EXTZ(X(rs1)[31..0]);
-
-X(rd) = base + (index << 3);
-
-
-
-
-
Included in
-
-
- ----- - - - - - - - - - - - - - - -
ExtensionMinimum versionLifecycle state

Zba (Address generation instructions)

0.93

Ratified

-
-
-
-

30.5.45. slli.uw

-
-
-
Synopsis
-
-

Shift-left unsigned word (Immediate)

-
-
Mnemonic
-
-

slli.uw rd, rs1, shamt

-
-
Encoding
-
-
-
-
-Diagram -
-
-
-
-
Description
-
-

This instruction takes the least-significant word of rs1, zero-extends it, and shifts it left by the immediate.

-
-
Operation
-
-
-
-
-
X(rd) = (EXTZ(X(rs1)[31..0]) << shamt);
-
-
-
-
-
Included in
-
-
- ----- - - - - - - - - - - - - - - -
ExtensionMinimum versionLifecycle state

Zba (Address generation instructions)

0.93

Ratified

-
- - - - - -
- - -
Architecture Explanation
-
-

This instruction is the same as slli with zext.w performed on rs1 before shifting.

-
-
-
-
-
-
-

30.5.46. unzip

-
-
-
Synopsis
-
-

Implements the inverse of the zip instruction.

-
-
Mnemonic
-
-

unzip rd, rs

-
-
Encoding
-
-
-
-
-Diagram -
-
-
-
-
Description
-
-

This instruction scatters all of the odd and even bits of the source -word into the high and low halves of the destination word, respectively. -It is the inverse of the zip instruction. -This instruction is available only on RV32.

-
-
Operation
-
-
-
-
-
foreach (i from 0 to xlen/2-1) {
-  X(rd)[i] = X(rs1)[2*i]
-  X(rd)[i+xlen/2] = X(rs1)[2*i+1]
-}
-
-
-
- - - - - -
- - -
Software Hint
-
-

This instruction is useful for implementing the SHA3 cryptographic -hash function on a 32-bit architecture, as it implements the -bit-interleaving operation used to speed up the 64-bit rotations -directly.

-
-
-
-
-
-
Included in
-
-
- ----- - - - - - - - - - - - - - - -
ExtensionMinimum versionLifecycle state

Zbkb (Bit-manipulation for Cryptography) (RV32)

v1.0

Ratified

-
-
-
-

30.5.47. xnor

-
-
-
Synopsis
-
-

Exclusive NOR

-
-
Mnemonic
-
-

xnor rd, rs1, rs2

-
-
Encoding
-
-
-
-
-Diagram -
-
-
-
-
Description
-
-

This instruction performs the bit-wise exclusive-NOR operation on rs1 and rs2.

-
-
Operation
-
-
-
-
-
X(rd) = ~(X(rs1) ^ X(rs2));
-
-
-
-
-
Included in
-
-
- ----- - - - - - - - - - - - - - - - - - - - -
ExtensionMinimum versionLifecycle state

Zbb (Basic bit-manipulation)

0.93

Ratified

Zbkb (Bit-manipulation for Cryptography)

v1.0

Ratified

-
-
-
-

30.5.48. xperm.b

-
-
-
Synopsis
-
-

Byte-wise lookup of indices into a vector in registers.

-
-
Mnemonic
-
-

xperm.b rd, rs1, rs2

-
-
Encoding
-
-
-
-
-Diagram -
-
-
-
-
Description
-
-

The xperm.b instruction operates on bytes. -The rs1 register contains a vector of XLEN/8 8-bit elements. -The rs2 register contains a vector of XLEN/8 8-bit indexes. -The result is each element in rs2 replaced by the indexed element in rs1, -or zero if the index into rs1 is out of bounds.

-
-
Operation
-
-
-
-
-
val xpermb_lookup : (bits(8), xlenbits) -> bits(8)
-function xpermb_lookup (idx, lut) = {
-    (lut >> (idx @ 0b000))[7..0]
-}
-
-function clause execute ( XPERM_B (rs2,rs1,rd)) = {
-    result : xlenbits = EXTZ(0b0);
-    foreach(i from 0 to xlen by 8) {
-        result[i+7..i] = xpermb_lookup(X(rs2)[i+7..i], X(rs1));
-    };
-    X(rd) = result;
-    RETIRE_SUCCESS
-}
-
-
-
-
-
Included in
-
-
- ----- - - - - - - - - - - - - - - -
ExtensionMinimum versionLifecycle state

Zbkx (Crossbar permutations)

v1.0

Ratified

-
-
-
-

30.5.49. xperm.n

-
-
-
Synopsis
-
-

Nibble-wise lookup of indices into a vector.

-
-
Mnemonic
-
-

xperm.n rd, rs1, rs2

-
-
Encoding
-
-
-
-
-Diagram -
-
-
-
-
Description
-
-

The xperm.n instruction operates on nibbles. -The rs1 register contains a vector of XLEN/4 4-bit elements. -The rs2 register contains a vector of XLEN/4 4-bit indexes. -The result is each element in rs2 replaced by the indexed element in rs1, -or zero if the index into rs1 is out of bounds.

-
-
Operation
-
-
-
-
-
val xpermn_lookup : (bits(4), xlenbits) -> bits(4)
-function xpermn_lookup (idx, lut) = {
-    (lut >> (idx @ 0b00))[3..0]
-}
-
-function clause execute ( XPERM_N (rs2,rs1,rd)) = {
-    result : xlenbits = EXTZ(0b0);
-    foreach(i from 0 to xlen by 4) {
-        result[i+3..i] = xpermn_lookup(X(rs2)[i+3..i], X(rs1));
-    };
-    X(rd) = result;
-    RETIRE_SUCCESS
-}
-
-
-
-
-
Included in
-
-
- ----- - - - - - - - - - - - - - - -
ExtensionMinimum versionLifecycle state

Zbkx (Crossbar permutations)

v1.0

Ratified

-
-
-
-

30.5.50. zext.h

-
-
-
Synopsis
-
-

Zero-extend halfword

-
-
Mnemonic
-
-

zext.h rd, rs

-
-
Encoding (RV32)
-
-
-
-
-Diagram -
-
-
-
-
Encoding (RV64)
-
-
-
-
-Diagram -
-
-
-
-
Description
-
-

This instruction zero-extends the least-significant halfword of the source to XLEN by inserting 0’s into all of the bits more significant than 15.

-
-
Operation
-
-
-
-
-
X(rd) = EXTZ(X(rs)[15..0]);
-
-
-
- - - - - -
- - -
Note
-
-

The zext.h mnemonic corresponds to different instruction encodings in RV32 and RV64.

-
-
-
-
-
-
Included in
-
-
- ----- - - - - - - - - - - - - - - -
ExtensionMinimum versionLifecycle state

Zbb (Basic bit-manipulation)

0.93

Ratified

-
-
-
-

30.5.51. zip

-
-
-
Synopsis
-
-

Interleave the upper and lower halves of the source word into the odd and even bits of the -destination, respectively.

-
-
Mnemonic
-
-

zip rd, rs

-
-
Encoding
-
-
-
-
-Diagram -
-
-
-
-
Description
-
-

This instruction gathers bits from the high and low halves of the source word into -the odd and even bit positions of the destination word, respectively. -It is the inverse of the unzip instruction. -This instruction is available only on RV32.

-
-
Operation
-
-
-
-
-
foreach (i from 0 to xlen/2-1) {
-  X(rd)[2*i] = X(rs1)[i]
-  X(rd)[2*i+1] = X(rs1)[i+xlen/2]
-}
-
-
-
- - - - - -
- - -
Software Hint
-
-

This instruction is useful for implementing the SHA3 cryptographic -hash function on a 32-bit architecture, as it implements the -bit-interleaving operation used to speed up the 64-bit rotations -directly.

-
-
-
-
-
-
Included in
-
-
- ----- - - - - - - - - - - - - - - -
ExtensionMinimum versionLifecycle state

Zbkb (Bit-manipulation for Cryptography) (RV32)

v1.0

Ratified

-
-
-
-

30.6. Software optimization guide

-
-

30.6.1. strlen

-
-

The orc.b instruction allows for the efficient detection of NUL bytes in an XLEN-sized chunk of data:

-
-
-
    -
  • -

    the result of orc.b on a chunk that does not contain any NUL bytes will be all-ones, and

    -
  • -
  • -

    after a bitwise-negation of the result of orc.b, the number of data bytes before the first NUL byte (if any) can be detected by ctz/clz (depending on the endianness of data).

    -
  • -
-
-
-

A full example of a strlen function, which uses these techniques and also demonstrates how to apply them to unaligned/partial data, is the following:

-
-
-
-
#include <sys/asm.h>
-
-	.text
-	.globl strlen
-	.type  strlen, @function
-strlen:
-	andi	a3, a0, (SZREG-1)   // offset
-	andi    a1, a0, -SZREG      // align pointer
-.Lprologue:
-	li      a4, SZREG
-	sub     a4, a4, a3          // SZREG - offset (valid bytes in first chunk)
-	slli	a3, a3, 3           // offset * 8
-	REG_L   a2, 0(a1)           // chunk
-	/*
-	 * Shift the partial/unaligned chunk we loaded to remove the bytes
-	 * from before the start of the string, adding NUL bytes at the end.
-	 */
-#if __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__
-	srl	a2, a2 ,a3          // chunk >> (offset * 8)
-#else
-	sll     a2, a2, a3
-#endif
-	orc.b   a2, a2
-	not	a2, a2
-	/*
-	 * Non-NUL bytes in the string have been expanded to 0x00, while
- 	 * NUL bytes have become 0xff.  Search for the first set bit
-	 * (corresponding to a NUL byte in the original chunk).
-	 */
-#if __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__
-	ctz     a2, a2
-#else
-	clz     a2, a2
-#endif
-	/*
-	 * The first chunk is special: compare against the number of valid
-	 * bytes in this chunk.
-	 */
-	srli    a0, a2, 3
-	bgtu    a4, a0, .Ldone
-	addi    a3, a1, SZREG
-	li      a4, -1
-	.align 2
-	/*
-	 * Our critical loop is 4 instructions and processes data in 4 byte
-	 * or 8 byte chunks.
-	 */
-.Lloop:
-	REG_L   a2, SZREG(a1)
-	addi    a1, a1, SZREG
-	orc.b   a2, a2
-	beq     a2, a4, .Lloop
-
-.Lepilogue:
-	not     a2, a2
-#if __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__
-	ctz     a2, a2
-#else
-	clz     a2, a2
-#endif
-	sub     a1, a1, a3
-	add	a0, a0, a1
-	srli    a2, a2, 3
-	add 	a0, a0, a2
-.Ldone:
-	ret
-
-
-
-
-

30.6.2. strcmp

-
-
-
#include <sys/asm.h>
-
-  .text
-  .globl strcmp
-  .type  strcmp, @function
-strcmp:
-  or    a4, a0, a1
-  li    t2, -1
-  and   a4, a4, SZREG-1
-  bnez  a4, .Lsimpleloop
-
-  # Main loop for aligned strings
-.Lloop:
-  REG_L a2, 0(a0)
-  REG_L a3, 0(a1)
-  orc.b t0, a2
-  bne   t0, t2, .Lfoundnull
-  addi  a0, a0, SZREG
-  addi  a1, a1, SZREG
-  beq   a2, a3, .Lloop
-
-  # Words don't match, and no null byte in first word.
-  # Get bytes in big-endian order and compare.
-#if __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__
-  rev8  a2, a2
-  rev8  a3, a3
-#endif
-  # Synthesize (a2 >= a3) ? 1 : -1 in a branchless sequence.
-  sltu a0, a2, a3
-  neg  a0, a0
-  ori  a0, a0, 1
-  ret
-
-.Lfoundnull:
-  # Found a null byte.
-  # If words don't match, fall back to simple loop.
-  bne   a2, a3, .Lsimpleloop
-
-  # Otherwise, strings are equal.
-  li    a0, 0
-  ret
-
-  # Simple loop for misaligned strings
-.Lsimpleloop:
-  lbu   a2, 0(a0)
-  lbu   a3, 0(a1)
-  addi  a0, a0, 1
-  addi  a1, a1, 1
-  bne   a2, a3, 1f
-  bnez  a2, .Lsimpleloop
-
-1:
-  sub   a0, a2, a3
-  ret
-
-.size   strcmp, .-strcmp
-
-
-
-
-
-
-
-

31. "J" Extension for Dynamically Translated Languages, Version 0.0

-
-
-

This chapter is a placeholder for a future standard extension to support -dynamically translated languages.

-
-
- - - - - -
- - -
-

Many popular languages are usually implemented via dynamic translation, -including Java and JavaScript. These languages can benefit from -additional ISA support for dynamic checks and garbage collection.

-
-
-
-
-
-
-

32. "P" Extension for Packed-SIMD Instructions, Version 0.2

-
-
- - - - - -
- - -
-

Discussions at the 5th RISC-V workshop indicated a desire to drop this -packed-SIMD proposal for floating-point registers in favor of -standardizing on the V extension for large floating-point SIMD -operations. However, there was interest in packed-SIMD fixed-point -operations for use in the integer registers of small RISC-V -implementations. A task group is working to define the new P extension.

-
-
-
-
-
-
-

33. "V" Standard Extension for Vector Operations, Version 1.0

-
- -
-
-
-

34. Cryptography Extensions: Scalar & Entropy Source Instructions, Version 1.0.1

-
- -
-
-
-

35. Cryptography Extensions: Vector Instructions, Version 1.0

-
- -
-
-
-

36. Control-flow Integrity (CFI)

-
-
-

CV64A6_MMU: The Zicfiss extension is not supported.

-
-
-

CV64A6_MMU: The Zicfilp extension is not supported.

-
-
-
-
-

37. RV32/64G Instruction Set Listings

-
-
-

One goal of the RISC-V project is that it be used as a stable software -development target. For this purpose, we define a combination of a base -ISA (RV32I or RV64I) plus selected standard extensions (IMAFD, Zicsr, -Zifencei) as a "general-purpose" ISA, and we use the abbreviation G -for the IMAFDZicsr_Zifencei combination of instruction-set extensions. -This chapter presents opcode maps and instruction-set listings for RV32G -and RV64G.

-
-
-

CV64A6_MMU: This chapter presents opcode maps and instruction-set -listings for CV64A6_MMU.

-
- - ----------- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
Table 27. RISC-V base opcode map, inst[1:0]=11

inst[4:2]

000

001

010

011

100

101

110

111 (>32b)

inst[6:5]

00

LOAD

LOAD-FP

custom-0

MISC-MEM

OP-IMM

AUIPC

OP-IMM-32

48b

01

STORE

STORE-FP

custom-1

AMO

OP

LUI

OP-32

64b

10

MADD

MSUB

NMSUB

NMADD

OP-FP

OP-V

custom-2/rv128

48b

11

BRANCH

JALR

reserved

JAL

SYSTEM

OP-VE

custom-3/rv128

≥80b

-
-

Table 27 shows a map of the major opcodes for -RVG. Major opcodes with 3 or more lower bits set are reserved for -instruction lengths greater than 32 bits. Opcodes marked as reserved -should be avoided for custom instruction-set extensions as they might be -used by future standard extensions. Major opcodes marked as custom-0 -and custom-1 will be avoided by future standard extensions and are -recommended for use by custom instruction-set extensions within the base -32-bit instruction format. The opcodes marked custom-2/rv128 and -custom-3/rv128 are reserved for future use by RV128, but will -otherwise be avoided for standard extensions and so can also be used for -custom instruction-set extensions in RV32 and RV64.

-
-
-

We believe RV32G and RV64G provide simple but complete instruction sets -for a broad range of general-purpose computing. The optional compressed -instruction set described in Chapter 28 can -be added (forming RV32GC and RV64GC) to improve performance, code size, -and energy efficiency, though with some additional hardware complexity.

-
-
-

As we move beyond IMAFDC into further instruction-set extensions, the -added instructions tend to be more domain-specific and only provide -benefits to a restricted class of applications, e.g., for multimedia or -security. Unlike most commercial ISAs, the RISC-V ISA design clearly -separates the base ISA and broadly applicable standard extensions from -these more specialized additions. Chapter 38 -has a more extensive discussion of ways to add extensions to the RISC-V -ISA.

-
-
- ----------------- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -

31

27

26

25

24

20

19

15

14

12

11

7

6

0

funct7

rs2

rs1

funct3

rd

opcode

R-type

imm[11:0]

rs1

funct3

rd

opcode

I-type

imm[11:5]

rs2

rs1

funct3

imm[4:0]

opcode

S-type

imm[12|10:5]

rs2

rs1

funct3

imm[4:1|11]

opcode

B-type

imm[31:12]

rd

opcode

U-type

imm[20|10:1|11|19:12]

rd

opcode

J-type

- ----------------- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -

RV32I Base Instruction Set

imm[31:12]

rd

0110111

LUI

imm[31:12]

rd

0010111

AUIPC

imm[20|10:1|11|19:12]

rd

1101111

JAL

imm[11:0]

rs1

000

rd

1100111

JALR

imm[12|10:5]

rs2

rs1

000

imm[4:1|11]

1100011

BEQ

imm[12|10:5]

rs2

rs1

001

imm[4:1|11]

1100011

BNE

imm[12|10:5]

rs2

rs1

100

imm[4:1|11]

1100011

BLT

imm[12|10:5]

rs2

rs1

101

imm[4:1|11]

1100011

BGE

imm[12|10:5]

rs2

rs1

110

imm[4:1|11]

1100011

BLTU

imm[12|10:5]

rs2

rs1

111

imm[4:1|11]

1100011

BGEU

imm[11:0]

rs1

000

rd

0000011

LB

imm[11:0]

rs1

001

rd

0000011

LH

imm[11:0]

rs1

010

rd

0000011

LW

imm[11:0]

rs1

100

rd

0000011

LBU

imm[11:0]

rs1

101

rd

0000011

LHU

imm[11:5]

rs2

rs1

000

imm[4:0]

0100011

SB

imm[11:5]

rs2

rs1

001

imm[4:0]

0100011

SH

imm[11:5]

rs2

rs1

010

imm[4:0]

0100011

SW

imm[11:0]

rs1

000

rd

0010011

ADDI

imm[11:0]

rs1

010

rd

0010011

SLTI

imm[11:0]

rs1

011

rd

0010011

SLTIU

imm[11:0]

rs1

100

rd

0010011

XORI

imm[11:0]

rs1

110

rd

0010011

ORI

imm[11:0]

rs1

111

rd

0010011

ANDI

0000000

shamt

rs1

001

rd

0010011

SLLI

0000000

shamt

rs1

101

rd

0010011

SRLI

0100000

shamt

rs1

101

rd

0010011

SRAI

0000000

rs2

rs1

000

rd

0110011

ADD

0100000

rs2

rs1

000

rd

0110011

SUB

0000000

rs2

rs1

001

rd

0110011

SLL

0000000

rs2

rs1

010

rd

0110011

SLT

0000000

rs2

rs1

011

rd

0110011

SLTU

0000000

rs2

rs1

100

rd

0110011

XOR

0000000

rs2

rs1

101

rd

0110011

SRL

0100000

rs2

rs1

101

rd

0110011

SRA

0000000

rs2

rs1

110

rd

0110011

OR

0000000

rs2

rs1

111

rd

0110011

AND

fm

pred

succ

rs1

000

rd

0001111

FENCE

1000

0011

0011

00000

000

00000

0001111

FENCE.TSO

0000

0001

0000

00000

000

00000

0001111

PAUSE

000000000000

00000

000

00000

1110011

ECALL

000000000001

00000

000

00000

1110011

EBREAK

-
- ----------------- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -

31

27

26

25

24

20

19

15

14

12

11

7

6

0

funct7

rs2

rs1

funct3

rd

opcode

R-type

imm[11:0]

rs1

funct3

rd

opcode

I-type

imm[11:5]

rs2

rs1

funct3

imm[4:0]

opcode

S-type

- ----------------- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -

RV64I Base Instruction Set (in addition to RV32I)

imm[11:0]

rs1

110

rd

0000011

LWU

imm[11:0]

rs1

011

rd

0000011

LD

imm[11:5]

rs2

rs1

011

imm[4:0]

0100011

SD

000000

shamt

rs1

001

rd

0010011

SLLI

000000

shamt

rs1

101

rd

0010011

SRLI

010000

shamt

rs1

101

rd

0010011

SRAI

imm[11:0]

rs1

000

rd

0011011

ADDIW

0000000

shamt

rs1

001

rd

0011011

SLLIW

0000000

shamt

rs1

101

rd

0011011

SRLIW

0100000

shamt

rs1

101

rd

0011011

SRAIW

0000000

rs2

rs1

000

rd

0111011

ADDW

0100000

rs2

rs1

000

rd

0111011

SUBW

0000000

rs2

rs1

001

rd

0111011

SLLW

0000000

rs2

rs1

101

rd

0111011

SRLW

0100000

rs2

rs1

101

rd

0111011

SRAW

- ----------------- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -

RV32/RV64 Zicsr Standard Extension

csr

rs1

001

rd

1110011

CSRRW

csr

rs1

010

rd

1110011

CSRRS

csr

rs1

011

rd

1110011

CSRRC

csr

uimm

101

rd

1110011

CSRRWI

csr

uimm

110

rd

1110011

CSRRSI

csr

uimm

111

rd

1110011

CSRRCI

- ----------------- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -

RV32M Standard Extension

0000001

rs2

rs1

000

rd

0110011

MUL

0000001

rs2

rs1

001

rd

0110011

MULH

0000001

rs2

rs1

010

rd

0110011

MULHSU

0000001

rs2

rs1

011

rd

0110011

MULHU

0000001

rs2

rs1

100

rd

0110011

DIV

0000001

rs2

rs1

101

rd

0110011

DIVU

0000001

rs2

rs1

110

rd

0110011

REM

0000001

rs2

rs1

111

rd

0110011

REMU

- ----------------- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -

RV64M Standard Extension (in addition to RV32M)

0000001

rs2

rs1

000

rd

0111011

MULW

0000001

rs2

rs1

100

rd

0111011

DIVW

0000001

rs2

rs1

101

rd

0111011

DIVUW

0000001

rs2

rs1

110

rd

0111011

REMW

0000001

rs2

rs1

111

rd

0111011

REMUW

-
-
-

Table 28 lists the CSRs that have currently been -allocated CSR addresses. The timers, counters, and floating-point CSRs -are the only CSRs defined in this specification.

-
- - ------ - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
Table 28. RISC-V control and status register (CSR) address map.
Number | Privilege | Name | Description

Floating-Point Control and Status Registers

0x001

Read write

fflags

Floating-Point Accrued Exceptions.

0x002

Read write

frm

Floating-Point Dynamic Rounding Mode.

0x003

Read write

fcsr

Floating-Point Control and Status Register (frm + fflags).

Counters and Timers

0xC00

Read-only

cycle

Cycle counter for RDCYCLE instruction.

0xC01

Read-only

time

Timer for RDTIME instruction.

0xC02

Read-only

instret

Instructions-retired counter for RDINSTRET instruction.

0xC80

Read-only

cycleh

Upper 32 bits of cycle, RV32I only.

0xC81

Read-only

timeh

Upper 32 bits of time, RV32I only.

0xC82

Read-only

instreth

Upper 32 bits of instret, RV32I only.

-
-
-
-

38. Extending RISC-V

-
-
-

In addition to supporting standard general-purpose software development, -another goal of RISC-V is to provide a basis for more specialized -instruction-set extensions or more customized accelerators. The -instruction encoding spaces and optional variable-length instruction -encoding are designed to make it easier to leverage software development -effort for the standard ISA toolchain when building more customized -processors. For example, the intent is to continue to provide full -software support for implementations that only use the standard I base, -perhaps together with many non-standard instruction-set extensions.

-
-
-

This chapter describes various ways in which the base RISC-V ISA can be -extended, together with the scheme for managing instruction-set -extensions developed by independent groups. This volume only deals with -the unprivileged ISA, although the same approach and terminology is used -for supervisor-level extensions described in the second volume.

-
-
-

38.1. Extension Terminology

-
-

This section defines some standard terminology for describing RISC-V -extensions.

-
-
-

38.1.1. Standard versus Non-Standard Extension

-
-

Any RISC-V processor implementation must support a base integer ISA -(RV32I, RV32E, RV64I, RV64E, or RV128I). In addition, an implementation may -support one or more extensions. We divide extensions into two broad -categories: standard versus non-standard.

-
-
-
    -
  • -

    A standard extension is one that is generally useful and that is -designed to not conflict with any other standard extension. Currently, -"MAFDQCBTPV", described in other chapters of this manual, are either -complete or planned standard extensions.

    -
  • -
  • -

    A non-standard extension may be highly specialized and may conflict -with other standard or non-standard extensions. We anticipate a wide -variety of non-standard extensions will be developed over time, with -some eventually being promoted to standard extensions.

    -
  • -
-
-
-
-

38.1.2. Instruction Encoding Spaces and Prefixes

-
-

An instruction encoding space is some number of instruction bits within -which a base ISA or ISA extension is encoded. RISC-V supports varying -instruction lengths, but even within a single instruction length, there -are various sizes of encoding space available. For example, the base -ISAs are defined within a 30-bit encoding space (bits 31-2 of the 32-bit -instruction), while the atomic extension "A" fits within a 25-bit -encoding space (bits 31-7).

-
-
-

We use the term prefix to refer to the bits to the right of an -instruction encoding space (since instruction fetch in RISC-V is -little-endian, the bits to the right are stored at earlier memory -addresses, hence form a prefix in instruction-fetch order). The prefix -for the standard base ISA encoding is the two-bit "11" field held in -bits 1-0 of the 32-bit word, while the prefix for the standard atomic -extension "A" is the seven-bit "0101111" field held in bits 6-0 of -the 32-bit word representing the AMO major opcode. A quirk of the -encoding format is that the 3-bit funct3 field used to encode a minor -opcode is not contiguous with the major opcode bits in the 32-bit -instruction format, but is considered part of the prefix for 22-bit -instruction spaces.

-
-
-

Although an instruction encoding space could be of any size, adopting a -smaller set of common sizes simplifies packing independently developed -extensions into a single global encoding. -Table 29 gives the suggested sizes for RISC-V.

-
- - -------- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
Table 29. Suggested standard RISC-V instruction encoding space sizes.
Size | Usage | # Available in standard instruction length

16-bit

32-bit

48-bit

64-bit

14-bit

Quadrant of compressed 16-bit encoding

3

22-bit

Minor opcode in base 32-bit encoding

$2^{8}$

$2^{20}$

$2^{35}$

25-bit

Major opcode in base 32-bit encoding

32

$2^{17}$

$2^{32}$

30-bit

Quadrant of base 32-bit encoding

1

$2^{12}$

$2^{27}$

32-bit

Minor opcode in 48-bit encoding

$2^{10}$

$2^{25}$

37-bit

Major opcode in 48-bit encoding

32

$2^{20}$

40-bit

Quadrant of 48-bit encoding

4

$2^{17}$

45-bit

Sub-minor opcode in 64-bit encoding

$2^{12}$

48-bit

Minor opcode in 64-bit encoding

$2^{9}$

52-bit

Major opcode in 64-bit encoding

32

-
-
-

38.1.3. Greenfield versus Brownfield Extensions

-
-

We use the term greenfield extension to describe an extension that -begins populating a new instruction encoding space, and hence can only -cause encoding conflicts at the prefix level. We use the term -brownfield extension to describe an extension that fits around -existing encodings in a previously defined instruction space. A -brownfield extension is necessarily tied to a particular greenfield -parent encoding, and there may be multiple brownfield extensions to the -same greenfield parent encoding. For example, the base ISAs are -greenfield encodings of a 30-bit instruction space, while the FDQ -floating-point extensions are all brownfield extensions adding to the -parent base ISA 30-bit encoding space.

-
-
-

Note that we consider the standard A extension to have a greenfield -encoding as it defines a new previously empty 25-bit encoding space in -the leftmost bits of the full 32-bit base instruction encoding, even -though its standard prefix locates it within the 30-bit encoding space -of its parent base ISA. Changing only its single 7-bit prefix could move -the A extension to a different 30-bit encoding space while only worrying -about conflicts at the prefix level, not within the encoding space -itself.

-
- - ----- - - - - - - - - - - - - - - - - - - - -
Table 30. Two-dimensional characterization of standard instruction-set extensions.
Adds state | No new state

Greenfield

RV32I(30), RV64I(30)

A(25)

Brownfield

F(I), D(F), Q(D)

M(I)

-
-

Table 30 shows the bases and standard extensions placed -in a simple two-dimensional taxonomy. One axis is whether the extension -is greenfield or brownfield, while the other axis is whether the -extension adds architectural state. For greenfield extensions, the size -of the instruction encoding space is given in parentheses. For -brownfield extensions, the name of the extension (greenfield or -brownfield) it builds upon is given in parentheses. Additional -user-level architectural state usually implies changes to the -supervisor-level system or possibly to the standard calling convention.

-
-
-

Note that RV64I is not considered an extension of RV32I, but a different -complete base encoding.

-
-
-
-

38.1.4. Standard-Compatible Global Encodings

-
-

A complete or global encoding of an ISA for an actual RISC-V -implementation must allocate a unique non-conflicting prefix for every -included instruction encoding space. The bases and every standard -extension have each had a standard prefix allocated to ensure they can -all coexist in a global encoding.

-
-
-

A standard-compatible global encoding is one where the base and every -included standard extension have their standard prefixes. A -standard-compatible global encoding can include non-standard extensions -that do not conflict with the included standard extensions. A -standard-compatible global encoding can also use standard prefixes for -non-standard extensions if the associated standard extensions are not -included in the global encoding. In other words, a standard extension -must use its standard prefix if included in a standard-compatible global -encoding, but otherwise its prefix is free to be reallocated. These -constraints allow a common toolchain to target the standard subset of -any RISC-V standard-compatible global encoding.

-
-
-
-

38.1.5. Guaranteed Non-Standard Encoding Space

-
-

To support development of proprietary custom extensions, portions of the -encoding space are guaranteed to never be used by standard extensions.

-
-
-
-
-

38.2. RISC-V Extension Design Philosophy

-
-

We intend to support a large number of independently developed -extensions by encouraging extension developers to operate within -instruction encoding spaces, and by providing tools to pack these into a -standard-compatible global encoding by allocating unique prefixes. Some -extensions are more naturally implemented as brownfield augmentations of -existing extensions, and will share whatever prefix is allocated to -their parent greenfield extension. The standard extension prefixes avoid -spurious incompatibilities in the encoding of core functionality, while -allowing custom packing of more esoteric extensions.

-
-
-

This capability of repacking RISC-V extensions into different -standard-compatible global encodings can be used in a number of ways.

-
-
-

One use-case is developing highly specialized custom accelerators, -designed to run kernels from important application domains. These might -want to drop all but the base integer ISA and add in only the extensions -that are required for the task in hand. The base ISAs have been designed -to place minimal requirements on a hardware implementation, and have been -encoded to use only a small fraction of a 32-bit instruction encoding -space.

-
-
-

Another use-case is to build a research prototype for a new type of -instruction-set extension. The researchers might not want to expend the -effort to implement a variable-length instruction-fetch unit, and so -would like to prototype their extension using a simple 32-bit -fixed-width instruction encoding. However, this new extension might be -too large to coexist with standard extensions in the 32-bit space. If -the research experiments do not need all of the standard extensions, a -standard-compatible global encoding might drop the unused standard -extensions and reuse their prefixes to place the proposed extension in a -non-standard location to simplify engineering of the research prototype. -Standard tools will still be able to target the base and any standard -extensions that are present to reduce development time. Once the -instruction-set extension has been evaluated and refined, it could then -be made available for packing into a larger variable-length encoding -space to avoid conflicts with all standard extensions.

-
-
-

The following sections describe increasingly sophisticated strategies -for developing implementations with new instruction-set extensions. -These are mostly intended for use in highly customized, educational, or -experimental architectures rather than for the main line of RISC-V ISA -development.

-
-
-
-

38.3. Extensions within fixed-width 32-bit instruction format

-
-

In this section, we discuss adding extensions to implementations that -only support the base fixed-width 32-bit instruction format.

-
-
- - - - - -
- - -
-

We anticipate the simplest fixed-width 32-bit encoding will be popular -for many restricted accelerators and research prototypes.

-
-
-
-
-

38.3.1. Available 30-bit instruction encoding spaces

-
-

In the standard encoding, three of the available 30-bit instruction -encoding spaces (those with 2-bit prefixes 00, 01, and 10) are used to -enable the optional compressed instruction extension. However, if the -compressed instruction-set extension is not required, then these three -further 30-bit encoding spaces become available. This quadruples the -available encoding space within the 32-bit format.

-
-
-
-

38.3.2. Available 25-bit instruction encoding spaces

-
-

A 25-bit instruction encoding space corresponds to a major opcode in the -base and standard extension encodings.

-
-
-

There are four major opcodes expressly designated for custom extensions -(see Table 27), each of which represents a 25-bit -encoding space. Two of these are reserved for eventual use in the RV128 -base encoding (will be OP-IMM-64 and OP-64), but can be used for -non-standard extensions for RV32 and RV64.

-
-
-

The two major opcodes reserved for RV64 (OP-IMM-32 and OP-32) can also -be used for non-standard extensions to RV32 only.

-
-
-

If an implementation does not require floating-point, then the seven -major opcodes reserved for standard floating-point extensions (LOAD-FP, -STORE-FP, MADD, MSUB, NMSUB, NMADD, OP-FP) can be reused for -non-standard extensions. Similarly, the AMO major opcode can be reused -if the standard atomic extensions are not required.

-
-
-

If an implementation does not require instructions longer than 32 bits, -then an additional four major opcodes are available (those marked in -gray in Table 27).

-
-
-

The base RV32I encoding uses only 11 major opcodes plus 3 reserved -opcodes, leaving up to 18 available for extensions. The base RV64I -encoding uses only 13 major opcodes plus 3 reserved opcodes, leaving up -to 16 available for extensions.

-
-
-
-

38.3.3. Available 22-bit instruction encoding spaces

-
-

A 22-bit encoding space corresponds to a funct3 minor opcode space in -the base and standard extension encodings. Several major opcodes have a -funct3 field minor opcode that is not completely occupied, leaving -available several 22-bit encoding spaces.

-
-
-

Usually a major opcode selects the format used to encode operands in the -remaining bits of the instruction, and ideally, an extension should -follow the operand format of the major opcode to simplify hardware -decoding.

-
-
-
-

38.3.4. Other spaces

-
-

Smaller spaces are available under certain major opcodes, and not all -minor opcodes are entirely filled.

-
-
-
-
-

38.4. Adding aligned 64-bit instruction extensions

-
-

The simplest approach to provide space for extensions that are too large -for the base 32-bit fixed-width instruction format is to add naturally -aligned 64-bit instructions. The implementation must still support the -32-bit base instruction format, but can require that 64-bit instructions -are aligned on 64-bit boundaries to simplify instruction fetch, with a -32-bit NOP instruction used as alignment padding where necessary.

-
-
-

To simplify use of standard tools, the 64-bit instructions should be -encoded as described in Table 1. -However, an implementation might choose a non-standard -instruction-length encoding for 64-bit instructions, while retaining the -standard encoding for 32-bit instructions. For example, if compressed -instructions are not required, then a 64-bit instruction could be -encoded using one or more zero bits in the first two bits of an -instruction.

-
-
- - - - - -
- - -
-

We anticipate processor generators that produce instruction-fetch units -capable of automatically handling any combination of supported -variable-length instruction encodings.

-
-
-
-
-
-

38.5. Supporting VLIW encodings

-
-

Although RISC-V was not designed as a base for a pure VLIW machine, VLIW -encodings can be added as extensions using several alternative -approaches. In all cases, the base 32-bit encoding has to be supported -to allow use of any standard software tools.

-
-
-

38.5.1. Fixed-size instruction group

-
-

The simplest approach is to define a single large naturally aligned -instruction format (e.g., 128 bits) within which VLIW operations are -encoded. In a conventional VLIW, this approach would tend to waste -instruction memory to hold NOPs, but a RISC-V-compatible implementation -would have to also support the base 32-bit instructions, confining the -VLIW code size expansion to VLIW-accelerated functions.

-
-
-
-

38.5.2. Encoded-Length Groups

-
-

Another approach is to use the standard length encoding from -Table 1 to encode parallel -instruction groups, allowing NOPs to be compressed out of the VLIW -instruction. For example, a 64-bit instruction could hold two 28-bit -operations, while a 96-bit instruction could hold three 28-bit -operations, and so on. Alternatively, a 48-bit instruction could hold -one 42-bit operation, while a 96-bit instruction could hold two 42-bit -operations, and so on.

-
-
-

This approach has the advantage of retaining the base ISA encoding for -instructions holding a single operation, but has the disadvantage of -requiring a new 28-bit or 42-bit encoding for operations within the VLIW -instructions, and misaligned instruction fetch for larger groups. One -simplification is to not allow VLIW instructions to straddle certain -microarchitecturally significant boundaries (e.g., cache lines or -virtual memory pages).

-
-
-
-

38.5.3. Fixed-Size Instruction Bundles

-
-

Another approach, similar to Itanium, is to use a larger naturally -aligned fixed instruction bundle size (e.g., 128 bits) across which -parallel operation groups are encoded. This simplifies instruction -fetch, but shifts the complexity to the group execution engine. To -remain RISC-V compatible, the base 32-bit instruction would still have -to be supported.

-
-
-
-

38.5.4. End-of-Group bits in Prefix

-
-

None of the above approaches retains the RISC-V encoding for the -individual operations within a VLIW instruction. Yet another approach is -to repurpose the two prefix bits in the fixed-width 32-bit encoding. One -prefix bit can be used to signal "end-of-group" if set, while the -second bit could indicate execution under a predicate if clear. Standard -RISC-V 32-bit instructions generated by tools unaware of the VLIW -extension would have both prefix bits set (11) and thus have the correct -semantics, with each instruction at the end of a group and not -predicated.

-
-
-

The main disadvantage of this approach is that the base ISAs lack the -complex predication support usually required in an aggressive VLIW -system, and it is difficult to add space to specify more predicate -registers in the standard 30-bit encoding space.

-
-
-
-
-
-
-

39. ISA Extension Naming Conventions

-
-
-

This chapter describes the RISC-V ISA extension naming scheme that is -used to concisely describe the set of instructions present in a hardware -implementation, or the set of instructions used by an application binary -interface (ABI).

-
-
- - - - - -
- - -
-

The RISC-V ISA is designed to support a wide variety of implementations -with various experimental instruction-set extensions. We have found that -an organized naming scheme simplifies software tools and documentation.

-
-
-
-
-

39.1. Case Sensitivity

-
-

The ISA naming strings are case insensitive.

-
-
-
-

39.2. Base Integer ISA

-
-

RISC-V ISA strings begin with either RV32I, RV32E, RV64I, RV64E, or RV128I -indicating the supported address space size in bits for the base integer -ISA.

-
-
-
-

39.3. Instruction-Set Extension Names

-
-

Standard ISA extensions are given a name consisting of a single letter. -For example, the first four standard extensions to the integer bases -are: "M" for integer multiplication and division, "A" for atomic -memory instructions, "F" for single-precision floating-point -instructions, and "D" for double-precision floating-point -instructions. Any RISC-V instruction-set variant can be succinctly -described by concatenating the base integer prefix with the names of the -included extensions, e.g., "RV64IMAFD".

-
-
-

We have also defined an abbreviation "G" to represent the -"IMAFDZicsr_Zifencei" base and extensions, as this is intended to -represent our standard general-purpose ISA.

-
-
-

Standard extensions to the RISC-V ISA are given other reserved letters, -e.g., "Q" for quad-precision floating-point, or "C" for the 16-bit -compressed instruction format.

-
-
-

Some ISA extensions depend on the presence of other extensions, e.g., -"D" depends on "F" and "F" depends on "Zicsr". These dependencies -may be implicit in the ISA name: for example, RV32IF is equivalent to -RV32IFZicsr, and RV32ID is equivalent to RV32IFD and RV32IFDZicsr.

-
-
-
-

39.4. Version Numbers

-
-

Recognizing that instruction sets may expand or alter over time, we -encode extension version numbers following the extension name. Version -numbers are divided into major and minor version numbers, separated by a -"p". If the minor version is "0", then "p0" can be omitted from -the version string. Changes in major version numbers imply a loss of -backwards compatibility, whereas changes in only the minor version -number must be backwards-compatible. For example, the original 64-bit -standard ISA defined in release 1.0 of this manual can be written in -full as "RV64I1p0M1p0A1p0F1p0D1p0", more concisely as -"RV64I1M1A1F1D1".

-
-
-

We introduced the version numbering scheme with the second release. -Hence, we define the default version of a standard extension to be the -version present at that time, e.g., "RV32I" is equivalent to -"RV32I2".

-
-
-
-

39.5. Underscores

-
-

Underscores "_" may be used to separate ISA extensions to improve -readability and to provide disambiguation, e.g., "RV32I2_M2_A2".

-
-
-

Because the "P" extension for Packed SIMD can be confused for the -decimal point in a version number, it must be preceded by an underscore -if it follows a number. For example, "rv32i2p2" means version 2.2 of -RV32I, whereas "rv32i2_p2" means version 2.0 of RV32I with version 2.0 -of the P extension.

-
-
-
-

39.6. Additional Standard Unprivileged Extension Names

-
-

Standard unprivileged extensions can also be named using a single "Z" followed by -an alphabetical name and an optional version number. For example, -"Zifencei" names the instruction-fetch fence extension described in -Chapter 6; "Zifencei2" and -"Zifencei2p0" name version 2.0 of same.

-
-
-

The first letter following the "Z" conventionally indicates the most -closely related alphabetical extension category, IMAFDQLCBKJTPVH. For the -"Zfa" extension for additional floating-point instructions, for example, the letter "f" -indicates the extension is related to the "F" standard extension. If -multiple "Z" extensions are named, they should be ordered first by -category, then alphabetically within a category—for example, -"Zicsr_Zifencei_Zam".

-
-
-

All multi-letter extensions, including those with the "Z" prefix, must be -separated from other multi-letter extensions by an underscore, e.g., -"RV32IMACZicsr_Zifencei".

-
-
-
-

39.7. Supervisor-level Instruction-Set Extensions

-
-

Standard extensions that extend the supervisor-level virtual-memory -architecture are prefixed with the letters "Sv", followed by an alphabetical -name and an optional version number, or by a numeric name with no version number. -Other standard extensions that extend -the supervisor-level architecture are prefixed with the letters "Ss", -followed by an alphabetical name and an optional version number. Such -extensions are defined in Volume II.

-
-
-

Standard supervisor-level extensions should be listed after standard -unprivileged extensions. If multiple supervisor-level extensions are -listed, they should be ordered alphabetically.

-
-
-
-

39.8. Hypervisor-level Instruction-Set Extensions

-
-

Standard extensions that extend the hypervisor-level architecture are prefixed -with the letters "Sh". -If multiple hypervisor-level extensions are listed, they should be ordered -alphabetically.

-
-
- - - - - -
- - -Many augmentations to the hypervisor-level architecture are more -naturally defined as supervisor-level extensions, following the scheme -described in the previous section. -The "Sh" prefix is used by the few hypervisor-level extensions that have no -supervisor-visible effects. -
-
-
-
-

39.9. Machine-level Instruction-Set Extensions

-
-

Standard machine-level instruction-set extensions are prefixed with the -letters "Sm".

-
-
-

Standard machine-level extensions should be listed after standard -lesser-privileged extensions. If multiple machine-level extensions are -listed, they should be ordered alphabetically.

-
-
-
-

39.10. Non-Standard Extension Names

-
-

Non-standard extensions are named using a single "X" followed by an -alphabetical name and an optional version number. For example, -"Xhwacha" names the Hwacha vector-fetch ISA extension; "Xhwacha2" -and "Xhwacha2p0" name version 2.0 of same.

-
-
-

Non-standard extensions must be listed after all standard extensions, and, -like other multi-letter extensions, must be separated from other multi-letter -extensions by an underscore. -For example, an ISA with non-standard extensions Argle and -Bargle may be named "RV64IZifencei_Xargle_Xbargle".

-
-
-

If multiple non-standard extensions are listed, they should be ordered -alphabetically.

-
-
-
-

39.11. Subset Naming Convention

-
-

Table 31 summarizes the standardized extension -names. The table also defines the canonical -order in which extension names must appear in the name string, with -top-to-bottom in table indicating first-to-last in the name string, -e.g., RV32IMACV is legal, whereas RV32IMAVC is not.

-
- - ----- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
Table 31. Standard ISA extension names.
Subset | Name | Implies

Base ISA

Integer

I

Reduced Integer

E

Standard Unprivileged Extensions

Integer Multiplication and Division

M

Zmmul

Atomics

A

Single-Precision Floating-Point

F

Zicsr

Double-Precision Floating-Point

D

F

General

G

IMAFDZicsr_Zifencei

Quad-Precision Floating-Point

Q

D

16-bit Compressed Instructions

C

B Extension

B

Packed-SIMD Extensions

P

Vector Extension

V

D

Hypervisor Extension

H

Additional Standard Unprivileged Extensions

Additional Standard unprivileged extensions "abc"

Zabc

Standard Supervisor-Level Extensions

Supervisor-level extension "def"

Ssdef

Standard Machine-Level Extensions

Machine-level extension "jkl"

Smjkl

Non-Standard Extensions

Non-standard extension "mno"

Xmno

-
-
-
-
-

40. History and Acknowledgments

-
-
-

40.1. "Why Develop a new ISA?" Rationale from Berkeley Group

-
-

We developed RISC-V to support our own needs in research and education, -where our group is particularly interested in actual hardware -implementations of research ideas (we have completed eleven different -silicon fabrications of RISC-V since the first edition of this -specification), and in providing real implementations for students to -explore in classes (RISC-V processor RTL designs have been used in -multiple undergraduate and graduate classes at Berkeley). In our current -research, we are especially interested in the move towards specialized -and heterogeneous accelerators, driven by the power constraints imposed -by the end of conventional transistor scaling. We wanted a highly -flexible and extensible base ISA around which to build our research -effort.

-
-
-

A question we have been repeatedly asked is "Why develop a new ISA?" -The biggest obvious benefit of using an existing commercial ISA is the -large and widely supported software ecosystem, both development tools -and ported applications, which can be leveraged in research and -teaching. Other benefits include the existence of large amounts of -documentation and tutorial examples. However, our experience of using -commercial instruction sets for research and teaching is that these -benefits are smaller in practice, and do not outweigh the disadvantages:

-
-
-
    -
  • -

    Commercial ISAs are proprietary. Except for SPARC V8, which is an -open IEEE standard (IEEE Standard for a 32-Bit Microprocessor, 1994), most owners of commercial ISAs carefully guard -their intellectual property and do not welcome freely available -competitive implementations. This is much less of an issue for academic -research and teaching using only software simulators, but has been a -major concern for groups wishing to share actual RTL implementations. It -is also a major concern for entities who do not want to trust the few -sources of commercial ISA implementations, but who are prohibited from -creating their own clean room implementations. We cannot guarantee that -all RISC-V implementations will be free of third-party patent -infringements, but we can guarantee we will not attempt to sue a RISC-V -implementor.

    -
  • -
  • -

    Commercial ISAs are only popular in certain market domains. The most -obvious examples at time of writing are that the ARM architecture is not -well supported in the server space, and the Intel x86 architecture (or -for that matter, almost every other architecture) is not well supported -in the mobile space, though both Intel and ARM are attempting to enter -each other’s market segments. Another example is ARC and Tensilica, -which provide extensible cores but are focused on the embedded space. -This market segmentation dilutes the benefit of supporting a particular -commercial ISA as in practice the software ecosystem only exists for -certain domains, and has to be built for others.

    -
  • -
  • -

    Commercial ISAs come and go. Previous research infrastructures have -been built around commercial ISAs that are no longer popular (SPARC, -MIPS) or even no longer in production (Alpha). These lose the benefit of -an active software ecosystem, and the lingering intellectual property -issues around the ISA and supporting tools interfere with the ability of -interested third parties to continue supporting the ISA. An open ISA -might also lose popularity, but any interested party can continue using -and developing the ecosystem.

    -
  • -
  • -

    Popular commercial ISAs are complex. The dominant commercial ISAs -(x86 and ARM) are both very complex to implement in hardware to the -level of supporting common software stacks and operating systems. Worse, -nearly all the complexity is due to bad, or at least outdated, ISA -design decisions rather than features that truly improve efficiency.

    -
  • -
  • -

    Commercial ISAs alone are not enough to bring up applications. Even -if we expend the effort to implement a commercial ISA, this is not -enough to run existing applications for that ISA. Most applications need -a complete ABI (application binary interface) to run, not just the -user-level ISA. Most ABIs rely on libraries, which in turn rely on -operating system support. To run an existing operating system requires -implementing the supervisor-level ISA and device interfaces expected by -the OS. These are usually much less well-specified and considerably more -complex to implement than the user-level ISA.

    -
  • -
  • -

    Popular commercial ISAs were not designed for extensibility. The -dominant commercial ISAs were not particularly designed for -extensibility, and as a consequence have added considerable instruction -encoding complexity as their instruction sets have grown. Companies such -as Tensilica (acquired by Cadence) and ARC (acquired by Synopsys) have -built ISAs and toolchains around extensibility, but have focused on -embedded applications rather than general-purpose computing systems.

    -
  • -
  • -

    A modified commercial ISA is a new ISA. One of our main goals is to -support architecture research, including major ISA extensions. Even -small extensions diminish the benefit of using a standard ISA, as -compilers have to be modified and applications rebuilt from source code -to use the extension. Larger extensions that introduce new architectural -state also require modifications to the operating system. Ultimately, -the modified commercial ISA becomes a new ISA, but carries along all the -legacy baggage of the base ISA.

    -
  • -
-
-
-

Our position is that the ISA is perhaps the most important interface in -a computing system, and there is no reason that such an important -interface should be proprietary. The dominant commercial ISAs are based -on instruction-set concepts that were already well known over 30 years -ago. Software developers should be able to target an open standard -hardware target, and commercial processor designers should compete on -implementation quality.

-
-
-

We are far from the first to contemplate an open ISA design suitable for -hardware implementation. We also considered other existing open ISA -designs, of which the closest to our goals was the OpenRISC -architecture (OpenCores, 2012). We decided against adopting the OpenRISC ISA for several -technical reasons:

-
-
-
    -
  • -

    OpenRISC has condition codes and branch delay slots, which complicate -higher performance implementations.

    -
  • -
  • -

    OpenRISC uses a fixed 32-bit encoding and 16-bit immediates, which -precludes a denser instruction encoding and limits space for later -expansion of the ISA.

    -
  • -
  • -

    OpenRISC does not support the 2008 revision to the IEEE 754 -floating-point standard.

    -
  • -
  • -

    The OpenRISC 64-bit design had not been completed when we began.

    -
  • -
-
-
-

By starting from a clean slate, we could design an ISA that met all of -our goals, though of course, this took far more effort than we had -planned at the outset. We have now invested considerable effort in -building up the RISC-V ISA infrastructure, including documentation, -compiler tool chains, operating system ports, reference ISA simulators, -FPGA implementations, efficient ASIC implementations, architecture test -suites, and teaching materials. Since the last edition of this manual, -there has been considerable uptake of the RISC-V ISA in both academia -and industry, and we have created the non-profit RISC-V Foundation to -protect and promote the standard. The RISC-V Foundation website at -riscv.org contains the latest information on the Foundation -membership and various open-source projects using RISC-V.

-
-
-
-

40.2. History from Revision 1.0 of ISA manual

-
-

The RISC-V ISA and instruction-set manual builds upon several earlier -projects. Several aspects of the supervisor-level machine and the -overall format of the manual date back to the T0 (Torrent-0) vector -microprocessor project at UC Berkeley and ICSI, begun in 1992. T0 was a -vector processor based on the MIPS-II ISA, with Krste Asanović as main -architect and RTL designer, and Brian Kingsbury and Bertrand Irrisou as -principal VLSI implementors. David Johnson at ICSI was a major -contributor to the T0 ISA design, particularly supervisor mode, and to -the manual text. John Hauser also provided considerable feedback on the -T0 ISA design.

-
-
-

The Scale (Software-Controlled Architecture for Low Energy) project at -MIT, begun in 2000, built upon the T0 project infrastructure, refined -the supervisor-level interface, and moved away from the MIPS scalar ISA -by dropping the branch delay slot. Ronny Krashinsky and Christopher -Batten were the principal architects of the Scale Vector-Thread -processor at MIT, while Mark Hampton ported the GCC-based compiler -infrastructure and tools for Scale.

-
-
-

A lightly edited version of the T0 MIPS scalar processor specification -(MIPS-6371) was used in teaching a new version of the MIT 6.371 -Introduction to VLSI Systems class in the Fall 2002 semester, with Chris -Terman and Krste Asanović as lecturers. Chris Terman contributed most of -the lab material for the class (there was no TA!). The 6.371 class -evolved into the trial 6.884 Complex Digital Design class at MIT, taught -by Arvind and Krste Asanović in Spring 2005, which became a regular -Spring class 6.375. A reduced version of the Scale MIPS-based scalar -ISA, named SMIPS, was used in 6.884/6.375. Christopher Batten was the TA -for the early offerings of these classes and developed a considerable -amount of documentation and lab material based around the SMIPS ISA. -This same SMIPS lab material was adapted and enhanced by TA Yunsup Lee -for the UC Berkeley Fall 2009 CS250 VLSI Systems Design class taught by -John Wawrzynek, Krste Asanović, and John Lazzaro.

-
-
-

The Maven (Malleable Array of Vector-thread ENgines) project was a -second-generation vector-thread architecture. Its design was led by -Christopher Batten when he was an Exchange Scholar at UC Berkeley -starting in summer 2007. Hidetaka Aoki, a visiting industrial fellow -from Hitachi, gave considerable feedback on the early Maven ISA and -microarchitecture design. The Maven infrastructure was based on the -Scale infrastructure but the Maven ISA moved further away from the MIPS -ISA variant defined in Scale, with a unified floating-point and integer -register file. Maven was designed to support experimentation with -alternative data-parallel accelerators. Yunsup Lee was the main -implementor of the various Maven vector units, while Rimas Avižienis was -the main implementor of the various Maven scalar units. Yunsup Lee and -Christopher Batten ported GCC to work with the new Maven ISA. -Christopher Celio provided the initial definition of a traditional -vector instruction set ("Flood") variant of Maven.

-
-
-

Based on experience with all these previous projects, the RISC-V ISA -definition was begun in Summer 2010, with Andrew Waterman, Yunsup Lee, -Krste Asanović, and David Patterson as principal designers. An initial -version of the RISC-V 32-bit instruction subset was used in the UC -Berkeley Fall 2010 CS250 VLSI Systems Design class, with Yunsup Lee as -TA. RISC-V is a clean break from the earlier MIPS-inspired designs. John -Hauser contributed to the floating-point ISA definition, including the -sign-injection instructions and a register encoding scheme that permits -internal recoding of floating-point values.

-
-
-
-

40.3. History from Revision 2.0 of ISA manual

-
-

Multiple implementations of RISC-V processors have been completed, -including several silicon fabrications, as shown in -the Fabricated RISC-V testchips table.

-
- ------ - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
Name | Tapeout Date | Process | ISA

Raven-1

May 29, 2011

ST 28nm FDSOI

RV64G1_Xhwacha1

EOS14

April 1, 2012

IBM 45nm SOI

RV64G1p1_Xhwacha2

EOS16

August 17, 2012

IBM 45nm SOI

RV64G1p1_Xhwacha2

Raven-2

August 22, 2012

ST 28nm FDSOI

RV64G1p1_Xhwacha2

EOS18

February 6, 2013

IBM 45nm SOI

RV64G1p1_Xhwacha2

EOS20

July 3, 2013

IBM 45nm SOI

RV64G1p99_Xhwacha2

Raven-3

September 26, 2013

ST 28nm SOI

RV64G1p99_Xhwacha2

EOS22

March 7, 2014

IBM 45nm SOI

RV64G1p9999_Xhwacha3

-
-

The first RISC-V processors to be fabricated were written in Verilog and -manufactured in a pre-production FDSOI technology from ST as the Raven-1 -testchip in 2011. Two cores were developed by Yunsup Lee and Andrew -Waterman, advised by Krste Asanović, and fabricated together: 1) an RV64 -scalar core with error-detecting flip-flops, and 2) an RV64 core with an -attached 64-bit floating-point vector unit. The first microarchitecture -was informally known as "TrainWreck", due to the short time available -to complete the design with immature design libraries.

-
-
-

Subsequently, a clean microarchitecture for an in-order decoupled RV64 -core was developed by Andrew Waterman, Rimas Avižienis, and Yunsup Lee, -advised by Krste Asanović, and, continuing the railway theme, was -codenamed "Rocket" after George Stephenson’s successful steam -locomotive design. Rocket was written in Chisel, a new hardware design -language developed at UC Berkeley. The IEEE floating-point units used in -Rocket were developed by John Hauser, Andrew Waterman, and Brian -Richards. Rocket has since been refined and developed further, and has -been fabricated two more times in FDSOI (Raven-2, Raven-3), and five -times in IBM SOI technology (EOS14, EOS16, EOS18, EOS20, EOS22) for a -photonics project. Work is ongoing to make the Rocket design available -as a parameterized RISC-V processor generator.

-
-
-

EOS14-EOS22 chips include early versions of Hwacha, a 64-bit IEEE -floating-point vector unit, developed by Yunsup Lee, Andrew Waterman, -Huy Vo, Albert Ou, Quan Nguyen, and Stephen Twigg, advised by Krste -Asanović. EOS16-EOS22 chips include dual cores with a cache-coherence -protocol developed by Henry Cook and Andrew Waterman, advised by Krste -Asanović. EOS14 silicon has successfully run at 1.25 GHz. EOS16 silicon suffered -from a bug in the IBM pad libraries. EOS18 and EOS20 have successfully -run at 1.35 GHz.

-
-
-

Contributors to the Raven testchips include Yunsup Lee, Andrew Waterman, -Rimas Avižienis, Brian Zimmer, Jaehwa Kwak, Ruzica Jevtić, Milovan -Blagojević, Alberto Puggelli, Steven Bailey, Ben Keller, Pi-Feng Chiu, -Brian Richards, Borivoje Nikolić, and Krste Asanović.

-
-
-

Contributors to the EOS testchips include Yunsup Lee, Rimas Avižienis, -Andrew Waterman, Henry Cook, Huy Vo, Daiwei Li, Chen Sun, Albert Ou, -Quan Nguyen, Stephen Twigg, Vladimir Stojanović, and Krste Asanović.

-
-
-

Andrew Waterman and Yunsup Lee developed the C++ ISA simulator -"Spike", used as a golden model in development and named after the -golden spike used to celebrate completion of the US transcontinental -railway. Spike has been made available as a BSD open-source project.

-
-
-

Andrew Waterman completed a Master’s thesis with a preliminary design of -the RISC-V compressed instruction set (Waterman, 2011).

-
-
-

Various FPGA implementations of the RISC-V have been completed, -primarily as part of integrated demos for the Par Lab project research -retreats. The largest FPGA design has 3 cache-coherent RV64IMA -processors running a research operating system. Contributors to the FPGA -implementations include Andrew Waterman, Yunsup Lee, Rimas Avižienis, -and Krste Asanović.

-
-
-

RISC-V processors have been used in several classes at UC Berkeley. -Rocket was used in the Fall 2011 offering of CS250 as a basis for class -projects, with Brian Zimmer as TA. For the undergraduate CS152 class in -Spring 2012, Christopher Celio used Chisel to write a suite of -educational RV32 processors, named "Sodor" after the island on which -"Thomas the Tank Engine" and friends live. The suite includes a -microcoded core, an unpipelined core, and 2, 3, and 5-stage pipelined -cores, and is publicly available under a BSD license. The suite was -subsequently updated and used again in CS152 in Spring 2013, with Yunsup -Lee as TA, and in Spring 2014, with Eric Love as TA. Christopher Celio -also developed an out-of-order RV64 design known as BOOM (Berkeley -Out-of-Order Machine), with accompanying pipeline visualizations, that -was used in the CS152 classes. The CS152 classes also used -cache-coherent versions of the Rocket core developed by Andrew Waterman -and Henry Cook.

-
-
-

Over the summer of 2013, the RoCC (Rocket Custom Coprocessor) interface -was defined to simplify adding custom accelerators to the Rocket core. -Rocket and the RoCC interface were used extensively in the Fall 2013 -CS250 VLSI class taught by Jonathan Bachrach, with several student -accelerator projects built to the RoCC interface. The Hwacha vector unit -has been rewritten as a RoCC coprocessor.

-
-
-

Two Berkeley undergraduates, Quan Nguyen and Albert Ou, -successfully ported Linux to run on RISC-V in Spring 2013.

-
-
-

Colin Schmidt successfully completed an LLVM backend for RISC-V 2.0 in -January 2014.

-
-
-

Darius Rad at Bluespec contributed soft-float ABI support to the GCC -port in March 2014.

-
-
-

John Hauser contributed the definition of the floating-point -classification instructions.

-
-
-

We are aware of several other RISC-V core implementations, including one -in Verilog by Tommy Thorn, and one in Bluespec by Rishiyur Nikhil.

-
-
-
-

40.4. Acknowledgments

-
-

Thanks to Christopher F. Batten, Preston Briggs, Christopher Celio, -David Chisnall, Stefan Freudenberger, John Hauser, Ben Keller, Rishiyur -Nikhil, Michael Taylor, Tommy Thorn, and Robert Watson for comments on -the draft ISA version 2.0 specification.

-
-
-
-

40.5. History from Revision 2.1

-
-

Uptake of the RISC-V ISA has been very rapid since the introduction of -the frozen version 2.0 in May 2014, with too much activity to record in -a short history section such as this. Perhaps the most important single -event was the formation of the non-profit RISC-V Foundation in August -2015. The Foundation will now take over stewardship of the official -RISC-V ISA standard, and the official website riscv.org is the best -place to obtain news and updates on the RISC-V standard.

-
-
-
-

40.6. Acknowledgments

-
-

Thanks to Scott Beamer, Allen J. Baum, Christopher Celio, David -Chisnall, Paul Clayton, Palmer Dabbelt, Jan Gray, Michael Hamburg, and -John Hauser for comments on the version 2.0 specification.

-
-
-
-

40.7. History from Revision 2.2

- -
-
-

40.8. Acknowledgments

-
-

Thanks to Jacob Bachmeyer, Alex Bradbury, David Horner, Stefan O’Rear, -and Joseph Myers for comments on the version 2.1 specification.

-
-
-
-

40.9. History for Revision 2.3

-
-

Uptake of RISC-V continues at a breakneck pace.

-
-
-

John Hauser and Andrew Waterman contributed a hypervisor ISA extension -based upon a proposal from Paolo Bonzini.

-
-
-

Daniel Lustig, Arvind, Krste Asanović, Shaked Flur, Paul Loewenstein, -Yatin Manerkar, Luc Maranget, Margaret Martonosi, Vijayanand Nagarajan, -Rishiyur Nikhil, Jonas Oberhauser, Christopher Pulte, Jose Renau, Peter -Sewell, Susmit Sarkar, Caroline Trippel, Muralidaran Vijayaraghavan, -Andrew Waterman, Derek Williams, Andrew Wright, and Sizhuo Zhang -contributed the memory consistency model.

-
-
-
-

40.10. Funding

-
-

Development of the RISC-V architecture and implementations has been -partially funded by the following sponsors.

-
-
-
    -
  • -

    Par Lab: Research supported by Microsoft (Award # 024263) and Intel -(Award # 024894) funding and by matching funding by U.C. Discovery (Award -# DIG07-10227). Additional support came from Par Lab affiliates Nokia, -NVIDIA, Oracle, and Samsung.

    -
  • -
  • -

    Project Isis: DoE Award DE-SC0003624.

    -
  • -
  • -

    ASPIRE Lab: DARPA PERFECT program, Award HR0011-12-2-0016. DARPA -POEM program Award HR0011-11-C-0100. The Center for Future Architectures -Research (C-FAR), a STARnet center funded by the Semiconductor Research -Corporation. Additional support from ASPIRE industrial sponsor, Intel, -and ASPIRE affiliates, Google, Hewlett Packard Enterprise, Huawei, -Nokia, NVIDIA, Oracle, and Samsung.

    -
  • -
-
-
-

The content of this paper does not necessarily reflect the position or -the policy of the US government and no official endorsement should be -inferred.

-
-
-
-
-
-

Appendix A: RVWMO Explanatory Material, Version 0.1

-
-
-

This section provides more explanation for RVWMO -(Chapter 18), using more informal -language and concrete examples. These are intended to clarify the -meaning and intent of the axioms and preserved program order rules. This -appendix should be treated as commentary; all normative material is -provided in Chapter 18 and in the rest of -the main body of the ISA specification. All currently known -discrepancies are listed in Section A.7. Any -other discrepancies are unintentional.

-
-
-

A.1. Why RVWMO?

-
-

Memory consistency models fall along a loose spectrum from weak to -strong. Weak memory models allow more hardware implementation -flexibility and deliver arguably better performance, performance per -watt, power, scalability, and hardware verification overheads than -strong models, at the expense of a more complex programming model. -Strong models provide simpler programming models, but at the cost of -imposing more restrictions on the kinds of (non-speculative) hardware -optimizations that can be performed in the pipeline and in the memory -system, and in turn imposing some cost in terms of power, area overhead, -and verification burden.

-
-
-

RISC-V has chosen the RVWMO memory model, a variant of release -consistency. This places it in between the two extremes of the memory -model spectrum. The RVWMO memory model enables architects to build -simple implementations, aggressive implementations, implementations -embedded deeply inside a much larger system and subject to complex -memory system interactions, or any number of other possibilities, all -while simultaneously being strong enough to support programming language -memory models at high performance.

-
-
-

To facilitate the porting of code from other architectures, some -hardware implementations may choose to implement the Ztso extension, -which provides stricter RVTSO ordering semantics by default. Code -written for RVWMO is automatically and inherently compatible with RVTSO, -but code written assuming RVTSO is not guaranteed to run correctly on -RVWMO implementations. In fact, most RVWMO implementations will (and -should) simply refuse to run RVTSO-only binaries. Each implementation -must therefore choose whether to prioritize compatibility with RVTSO -code (e.g., to facilitate porting from x86) or whether to instead -prioritize compatibility with other RISC-V cores implementing RVWMO.

-
-
-

Some fences and/or memory ordering annotations in code written for RVWMO -may become redundant under RVTSO; the cost that the default of RVWMO -imposes on Ztso implementations is the incremental overhead of fetching -those fences (e.g., FENCE R,RW and FENCE RW,W) which become no-ops on -that implementation. However, these fences must remain present in the -code if compatibility with non-Ztso implementations is desired.

-
-
-
-

A.2. Litmus Tests

-
-

The explanations in this chapter make use of litmus tests, or small -programs designed to test or highlight one particular aspect of a memory -model. Table 32 shows an example -of a litmus test with two harts. As a convention for this figure and for -all figures that follow in this chapter, we assume that s0-s2 are -pre-set to the same value in all harts and that s0 holds the address -labeled x, s1 holds y, and s2 holds z, where x, y, and z -are disjoint memory locations aligned to 8-byte boundaries. All other registers and all referenced memory locations are presumed to be initialized to zero. Each figure -shows the litmus test code on the left, and a visualization of one -particular valid or invalid execution on the right.

-
- - ---- - - - - - - -
Table 32. A sample litmus test and one forbidden execution (a0=1).
------ - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
Hart 0 | Hart 1

li t1,1

li t4,4

(a)

sw t1,0(s0)

(e)

sw t4,0(s0)

li t2,2

(b)

sw t2,0(s0)

(c)

lw a0,0(s0)

li t3,3

li t5,5

(d)

sw t3,0(s0)

(f)

sw t5,0(s0)

--- - - - - - -
-
-litmus sample -
-
-
-

Litmus tests are used to understand the implications of the memory model -in specific concrete situations. For example, in the litmus test of -Table 32, the final value of a0 -in the first hart can be either 2, 4, or 5, depending on the dynamic -interleaving of the instruction stream from each hart at runtime. -However, in this example, the final value of a0 in Hart 0 will never -be 1 or 3; intuitively, the value 1 will no longer be visible at the -time the load executes, and the value 3 will not yet be visible by the -time the load executes. We analyze this test and many others below.

-
-
- - ---- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
Table 33. A key for the litmus test diagrams drawn in this appendix
Edge | Full Name (and explanation)

rf

Reads From (from each store to the loads that return a value -written by that store)

co

Coherence (a total order on the stores to each address)

fr

From-Reads (from each load to co-successors of the store from which -the load returned a value)

ppo

Preserved Program Order

fence

Orderings enforced by a FENCE instruction

addr

Address Dependency

ctrl

Control Dependency

data

Data Dependency

-
-

The diagram shown to the right of each litmus test shows a visual -representation of the particular execution candidate being considered. -These diagrams use a notation that is common in the memory model -literature for constraining the set of possible global memory orders -that could produce the execution in question. It is also the basis for -the herd models presented in -Section B.2. This notation is explained in -Table 33. Of the listed relations, rf edges between -harts, co edges, fr edges, and ppo edges directly constrain the global -memory order (as do fence, addr, data, and some ctrl edges, via ppo). -Other edges (such as intra-hart rf edges) are informative but do not -constrain the global memory order.

-
-
-

For example, in Table 32, a0=1 -could occur only if one of the following were true:

-
-
-
    -
  • -

    (b) appears before (a) in global memory order (and in the -coherence order co). However, this violates RVWMO PPO -rule 1. The co edge from (b) to (a) highlights this -contradiction.

    -
  • -
  • -

    (a) appears before (b) in global memory order (and in the -coherence order co). However, in this case, the Load Value Axiom would -be violated, because (a) is not the latest matching store prior to (c) -in program order. The fr edge from (c) to (b) highlights this -contradiction.

    -
  • -
-
-
-

Since neither of these scenarios satisfies the RVWMO axioms, the outcome -a0=1 is forbidden.

-
-
-

Beyond what is described in this appendix, a suite of more than seven -thousand litmus tests is available at -github.com/litmus-tests/litmus-tests-riscv.

-
-
- - - - - -
- - -
-

The litmus tests repository also provides instructions on how to run the -litmus tests on RISC-V hardware and how to compare the results with the -operational and axiomatic models.

-
-
-

In the future, we expect to adapt these memory model litmus tests for -use as part of the RISC-V compliance test suite as well.

-
-
-
-
-
-

A.3. Explaining the RVWMO Rules

-
-

In this section, we provide explanation and examples for all of the -RVWMO rules and axioms.

-
-
-

A.3.1. Preserved Program Order and Global Memory Order

-
-

Preserved program order represents the subset of program order that must -be respected within the global memory order. Conceptually, events from -the same hart that are ordered by preserved program order must appear in -that order from the perspective of other harts and/or observers. Events -from the same hart that are not ordered by preserved program order, on -the other hand, may appear reordered from the perspective of other harts -and/or observers.

-
-
-

Informally, the global memory order represents the order in which loads -and stores perform. The formal memory model literature has moved away -from specifications built around the concept of performing, but the idea -is still useful for building up informal intuition. A load is said to -have performed when its return value is determined. A store is said to -have performed not when it has executed inside the pipeline, but rather -only when its value has been propagated to globally visible memory. In -this sense, the global memory order also represents the contribution of -the coherence protocol and/or the rest of the memory system to -interleave the (possibly reordered) memory accesses being issued by each -hart into a single total order agreed upon by all harts.

-
-
-

The order in which loads perform does not always directly correspond to -the relative age of the values those two loads return. In particular, a -load b may perform before another load a to -the same address (i.e., b may execute before -a, and b may appear before a -in the global memory order), but a may nevertheless return -an older value than b. This discrepancy captures (among -other things) the reordering effects of buffering placed between the -core and memory. For example, b may have returned a value -from a store in the store buffer, while a may have ignored -that younger store and read an older value from memory instead. To -account for this, at the time each load performs, the value it returns -is determined by the load value axiom, not just strictly by determining -the most recent store to the same address in the global memory order, as -described below.

-
-
-
-

A.3.2. Load value axiom

-
- - - - - -
- - -
-

Section 18.1.4.1: Each byte of each load i returns the value written -to that byte by the store that is the latest in global memory order among -the following stores:

-
-
-
    -
  1. -

    Stores that write that byte and that precede i in the global memory -order

    -
  2. -
  3. -

    Stores that write that byte and that precede i in program order

    -
  4. -
-
-
-
-
-

Preserved program order is not required to respect the ordering of a -store followed by a load to an overlapping address. This complexity -arises due to the ubiquity of store buffers in nearly all -implementations. Informally, the load may perform (return a value) by -forwarding from the store while the store is still in the store buffer, -and hence before the store itself performs (writes back to globally -visible memory). Any other hart will therefore observe the load as -performing before the store.

-
-
-

Consider the litmus test of Table 34. When running this program on an implementation with -store buffers, it is possible to arrive at the final outcome a0=1, a1=0, a2=1, a3=0 as follows:

-
- - ---- - - - - - - -
Table 34. A store buffer forwarding litmus test (outcome permitted)
------ - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
Hart 0 | Hart 1

li t1, 1

li t1, 1

(a) sw t1,0(s0)

(e) sw t1,0(s1)

(b) lw a0,0(s0)

(f) lw a2,0(s1)

(c) fence r,r

(g) fence r,r

(d) lw a1,0(s1)

(h) lw a3,0(s0)

Outcome: a0=1, a1=0, a2=1, a3=0

--- - - - - - -
-
-litmus sb fwd -
-
-
-
    -
  • -

    (a) executes and enters the first hart’s private store buffer

    -
  • -
  • -

    (b) executes and forwards its return value 1 from (a) in the -store buffer

    -
  • -
  • -

    (c) executes since all previous loads (i.e., (b)) have -completed

    -
  • -
  • -

    (d) executes and reads the value 0 from memory

    -
  • -
  • -

    (e) executes and enters the second hart’s private store buffer

    -
  • -
  • -

    (f) executes and forwards its return value 1 from (e) in the -store buffer

    -
  • -
  • -

    (g) executes since all previous loads (i.e., (f)) have -completed

    -
  • -
  • -

    (h) executes and reads the value 0 from memory

    -
  • -
  • -

    (a) drains from the first hart’s store buffer to memory

    -
  • -
  • -

    (e) drains from the second hart’s store buffer to memory

    -
  • -
-
-
-

Therefore, the memory model must be able to account for this behavior.

-
-
-

To put it another way, suppose the definition of preserved program order -did include the following hypothetical rule: memory access -a precedes memory access b in preserved -program order (and hence also in the global memory order) if -a precedes b in program order and -a and b are accesses to the same memory -location, a is a write, and b is a read. -Call this "Rule X". Then we get the following:

-
-
-
    -
  • -

    (a) precedes (b): by rule X

    -
  • -
  • -

    (b) precedes (d): by rule 4

    -
  • -
  • -

    (d) precedes (e): by the load value axiom. Otherwise, if (e) -preceded (d), then (d) would be required to return the value 1. (This is -a perfectly legal execution; it’s just not the one in question.)

    -
  • -
  • -

    (e) precedes (f): by rule X

    -
  • -
  • -

    (f) precedes (h): by rule 4

    -
  • -
  • -

    (h) precedes (a): by the load value axiom, as above.

    -
  • -
-
-
-

The global memory order must be a total order and cannot be cyclic, -because a cycle would imply that every event in the cycle happens before -itself, which is impossible. Therefore, the execution proposed above -would be forbidden, and hence the addition of rule X would forbid -implementations with store buffer forwarding, which would clearly be -undesirable.

-
-
-

Nevertheless, even if (b) precedes (a) and/or (f) precedes (e) in the -global memory order, the only sensible possibility in this example is -for (b) to return the value written by (a), and likewise for (f) and -(e). This combination of circumstances is what leads to the second -option in the definition of the load value axiom. Even though (b) -precedes (a) in the global memory order, (a) will still be visible to -(b) by virtue of sitting in the store buffer at the time (b) executes. -Therefore, even if (b) precedes (a) in the global memory order, (b) -should return the value written by (a) because (a) precedes (b) in -program order. Likewise for (e) and (f).

-
- - ---- - - - - - - -
Table 35. The "PPOCA" store buffer forwarding litmus test (outcome permitted)
------ - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
Hart 0 | Hart 1

li t1, 1

li t1, 1

(a)

sw t1,0(s0)

LOOP:

(b)

fence w,w

(d)

lw a0,0(s1)

(c)

sw t1,0(s1)

beqz a0, LOOP

(e)

sw t1,0(s2)

(f)

lw a1,0(s2)

xor a2,a1,a1

add s0,s0,a2

(g)

lw a2,0(s0)

Outcome: a0=1, a1=1, a2=0

--- - - - - - -
-
-litmus ppoca -
-
-
-

Another test that highlights the behavior of store buffers is shown in -Table 35. In this example, (d) is -ordered before (e) because of the control dependency, and (f) is ordered -before (g) because of the address dependency. However, (e) is not -necessarily ordered before (f), even though (f) returns the value -written by (e). This could correspond to the following sequence of -events:

-
-
-
    -
  • -

    (e) executes speculatively and enters the second hart’s private -store buffer (but does not drain to memory)

    -
  • -
  • -

    (f) executes speculatively and forwards its return value 1 from -(e) in the store buffer

    -
  • -
  • -

    (g) executes speculatively and reads the value 0 from memory

    -
  • -
  • -

    (a) executes, enters the first hart’s private store buffer, and -drains to memory

    -
  • -
  • -

    (b) executes and retires

    -
  • -
  • -

    (c) executes, enters the first hart’s private store buffer, and -drains to memory

    -
  • -
  • -

    (d) executes and reads the value 1 from memory

    -
  • -
  • -

    (e), (f), and (g) commit, since the speculation turned out to be -correct

    -
  • -
  • -

    (e) drains from the store buffer to memory

    -
  • -
-
-
-
-

A.3.3. Atomicity axiom

-
- - - - - -
- - -
-

Atomicity Axiom (for Aligned Atomics): If r and w are paired load and -store operations generated by aligned LR and SC instructions in a hart -h, s is a store to byte x, and r returns a value written by s, then s must -precede w in the global memory order, and there can be no store from -a hart other than h to byte x following s and preceding w in the global -memory order.

-
-
-
-
-

The RISC-V architecture decouples the notion of atomicity from the -notion of ordering. Unlike architectures such as TSO, RISC-V atomics -under RVWMO do not impose any ordering requirements by default. Ordering -semantics are only guaranteed by the PPO rules that otherwise apply.

-
-
-

RISC-V contains two types of atomics: AMOs and LR/SC pairs. These -conceptually behave differently, in the following way. LR/SC behave as -if the old value is brought up to the core, modified, and written back -to memory, all while a reservation is held on that memory location. AMOs -on the other hand conceptually behave as if they are performed directly -in memory. AMOs are therefore inherently atomic, while LR/SC pairs are -atomic in the slightly different sense that the memory location in -question will not be modified by another hart during the time the -original hart holds the reservation.

-
- ------ - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
(a) lr.d a0, 0(s0) | (a) lr.d a0, 0(s0) | (a) lr.w a0, 0(s0) | (a) lr.w a0, 0(s0)

(b) sd t1, 0(s0)

(b) sw t1, 4(s0)

(b) sw t1, 4(s0)

(b) sw t1, 4(s0)

(c) sc.d t3, t2, 0(s0)

(c) sc.d t3, t2, 0(s0)

(c) sc.w t3, t2, 0(s0)

(c) addi s0, s0, 8

(d) sc.w t3, t2, 8(s0)

-
-

Figure 4: In all four (independent) instances, the final store-conditional instruction is permitted but not guaranteed to succeed.

-
-
-

The atomicity axiom forbids stores from other harts from being -interleaved in global memory order between an LR and the SC paired with -that LR. The atomicity axiom does not forbid loads from being -interleaved between the paired operations in program order or in the -global memory order, nor does it forbid stores from the same hart or -stores to non-overlapping locations from appearing between the paired -operations in either program order or in the global memory order. For -example, the SC instructions in Figure 4 may (but are not -guaranteed to) succeed. None of those successes would violate the -atomicity axiom, because the intervening non-conditional stores are from -the same hart as the paired load-reserved and store-conditional -instructions. This way, a memory system that tracks memory accesses at -cache line granularity (and which therefore will see the four snippets -of Figure 4 as identical) will not -be forced to fail a store-conditional instruction that happens to -(falsely) share another portion of the same cache line as the memory -location being held by the reservation.

-
-
-

The atomicity axiom also technically supports cases in which the LR and -SC touch different addresses and/or use different access sizes; however, -use cases for such behaviors are expected to be rare in practice. -Likewise, scenarios in which stores from the same hart between an LR/SC -pair actually overlap the memory location(s) referenced by the LR or SC -are expected to be rare compared to scenarios where the intervening -store may simply fall onto the same cache line.

-
-
-
-

A.3.4. Progress axiom

-
- - - - - -
- - -
-

Progress Axiom: No memory operation may be preceded in the global -memory order by an infinite sequence of other memory operations.

-
-
-
-
-

The progress axiom ensures a minimal forward progress guarantee. It -ensures that stores from one hart will eventually be made visible to -other harts in the system in a finite amount of time, and that loads -from other harts will eventually be able to read those values (or -successors thereof). Without this rule, it would be legal, for example, -for a spinlock to spin infinitely on a value, even with a store from -another hart waiting to unlock the spinlock.

-
-
-

The progress axiom is intended not to impose any other notion of -fairness, latency, or quality of service onto the harts in a RISC-V -implementation. Any stronger notions of fairness are up to the rest of -the ISA and/or up to the platform and/or device to define and implement.

-
-
-

The forward progress axiom will in almost all cases be naturally -satisfied by any standard cache coherence protocol. Implementations with -non-coherent caches may have to provide some other mechanism to ensure -the eventual visibility of all stores (or successors thereof) to all -harts.

-
-
-
-

A.3.5. Overlapping-Address Orderings (Rules 1-3)

-
- - - - - -
- - -
-

Rule 1: b is a store, and a and b access overlapping memory addresses

-
-
-

Rule 2: a and b are loads, x is a byte read by both a and b, there is no -store to x between a and b in program order, and a and b return values -for x written by different memory operations

-
-
-

Rule 3: a is generated by an AMO or SC instruction, b is a load, and b -returns a value written by a

-
-
-
-
-

Same-address orderings where the latter is a store are straightforward: -a load or store can never be reordered with a later store to an -overlapping memory location. From a microarchitecture perspective, -generally speaking, it is difficult or impossible to undo a -speculatively reordered store if the speculation turns out to be -invalid, so such behavior is simply disallowed by the model. -Same-address orderings from a store to a later load, on the other hand, -do not need to be enforced. As discussed in -Section A.3.2, this reflects the observable -behavior of implementations that forward values from buffered stores to -later loads.

-
-
-

Same-address load-load ordering requirements are far more subtle. The -basic requirement is that a younger load must not return a value that is -older than a value returned by an older load in the same hart to the -same address. This is often known as "CoRR" (Coherence for Read-Read -pairs), or as part of a broader "coherence" or "sequential -consistency per location" requirement. Some architectures in the past -have relaxed same-address load-load ordering, but in hindsight this is -generally considered to complicate the programming model too much, and -so RVWMO requires CoRR ordering to be enforced. However, because the -global memory order corresponds to the order in which loads perform -rather than the ordering of the values being returned, capturing CoRR -requirements in terms of the global memory order requires a bit of -indirection.

-
- - ---- - - - - - - -
Table 36. Litmus test MP+fence.w.w+fre-rfi-addr (outcome permitted)
------ - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
Hart 0 | Hart 1

li t1, 1

li t2, 2

(a)

sw t1,0(s0)

(d)

lw a0,0(s1)

(b)

fence w, w

(e)

sw t2,0(s1)

(c)

sw t1,0(s1)

(f)

lw a1,0(s1)

(g)

xor t3,a1,a1

(h)

add s0,s0,t3

(i)

lw a2,0(s0)

Outcome: a0=1, a1=2, a2=0

--- - - - - - -
-
-litmus mp fenceww fri rfi addr -
-
-
-

Consider the litmus test of Table 36, which is one particular -instance of the more general "fri-rfi" pattern. The term "fri-rfi" -refers to the sequence (d), (e), (f): (d) "from-reads" (i.e., reads -from an earlier write than) (e) which is in the same hart, and (f) reads -from (e) which is in the same hart.

-
-
-

From a microarchitectural perspective, outcome a0=1, a1=2, a2=0 is -legal (as are various other less subtle outcomes). Intuitively, the -following would produce the outcome in question:

-
-
-
    -
  • -

    (d) stalls (for whatever reason; perhaps it’s stalled waiting -for some other preceding instruction)

    -
  • -
  • -

    (e) executes and enters the store buffer (but does not yet -drain to memory)

    -
  • -
  • -

    (f) executes and forwards from (e) in the store buffer

    -
  • -
  • -

    (g), (h), and (i) execute

    -
  • -
  • -

    (a) executes and drains to memory, (b) executes, and (c) -executes and drains to memory

    -
  • -
  • -

    (d) unstalls and executes

    -
  • -
  • -

    (e) drains from the store buffer to memory

    -
  • -
-
-
-

This corresponds to a global memory order of (f), (i), (a), (c), (d), -(e). Note that even though (f) performs before (d), the value returned -by (f) is newer than the value returned by (d). Therefore, this -execution is legal and does not violate the CoRR requirements.

-
-
-

Likewise, if two back-to-back loads return the values written by the -same store, then they may also appear out-of-order in the global memory -order without violating CoRR. Note that this is not the same as saying -that the two loads return the same value, since two different stores may -write the same value.

-
- - ---- - - - - - - -
Table 37. Litmus test RSW (outcome permitted)
------ - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
Hart 0Hart 1

li t1, 1

(d)

lw a0,0(s1)

(a)

sw t1,0(s0)

(e)

xor t2,a0,a0

(b)

fence w, w

(f)

add s4,s2,t2

(c)

sw t1,0(s1)

(g)

lw a1,0(s4)

(h)

lw a2,0(s2)

(i)

xor t3,a2,a2

(j)

add s0,s0,t3

(k)

lw a3,0(s0)

Outcome: a0=1, a1=v, a2=v, a3=0

--- - - - - - -
-
-litmus rsw -
-
-
-

Consider the litmus test of Table 37. -The outcome a0=1, a1=v, a2=v, a3=0 (where v is -some value written by another hart) can be observed by allowing (g) and -(h) to be reordered. This might be done speculatively, and the -speculation can be justified by the microarchitecture (e.g., by snooping -for cache invalidations and finding none) because replaying (h) after -(g) would return the value written by the same store anyway. Hence -assuming a1 and a2 would end up with the same value written by the -same store anyway, (g) and (h) can be legally reordered. The global -memory order corresponding to this execution would be -(h),(k),(a),(c),(d),(g).

-
-
-

Executions of the test in Table 37 in -which a1 does not equal a2 do in fact require that (g) appears -before (h) in the global memory order. Allowing (h) to appear before (g) -in the global memory order would in that case result in a violation of -CoRR, because then (h) would return an older value than that returned by -(g). Therefore, rule 2 forbids this CoRR violation -from occurring. As such, rule 2 strikes a careful -balance between enforcing CoRR in all cases while simultaneously being -weak enough to permit "RSW" and "fri-rfi" patterns that commonly -appear in real microarchitectures.

-
-
-

There is one more overlapping-address rule: rule 3 simply states that a value cannot -be returned from an AMO or SC to a subsequent load until the AMO or SC -has (in the case of the SC, successfully) performed globally. This -follows somewhat naturally from the conceptual view that both AMOs and -SC instructions are meant to be performed atomically in memory. However, -notably, rule 3 states that hardware -may not even non-speculatively forward the value being stored by an -AMOSWAP to a subsequent load, even though for AMOSWAP that store value -is not actually semantically dependent on the previous value in memory, -as is the case for the other AMOs. The same holds true even when -forwarding from SC store values that are not semantically dependent on -the value returned by the paired LR.

-
-
-

The three PPO rules above also apply when the memory accesses in -question only overlap partially. This can occur, for example, when -accesses of different sizes are used to access the same object. Note -also that the base addresses of two overlapping memory operations need -not necessarily be the same for two memory accesses to overlap. When -misaligned memory accesses are being used, the overlapping-address PPO -rules apply to each of the component memory accesses independently.

-
-
-
-

A.3.6. Fences (Rule 4)

-
- - - - - -
- - -
-

Rule 4: There is a FENCE instruction that orders a before b

-
-
-
-
-

By default, the FENCE instruction ensures that all memory accesses from -instructions preceding the fence in program order (the "predecessor -set") appear earlier in the global memory order than memory accesses -from instructions appearing after the fence in program order (the -"successor set"). However, fences can optionally further restrict the -predecessor set and/or the successor set to a smaller set of memory -accesses in order to provide some speedup. Specifically, fences have PR, -PW, SR, and SW bits which restrict the predecessor and/or successor -sets. The predecessor set includes loads (resp. stores) if and only if PR -(resp. PW) is set. Similarly, the successor set includes loads -(resp. stores) if and only if SR (resp. SW) is set.

-
-
-

The FENCE encoding currently has nine non-trivial combinations of the -four bits PR, PW, SR, and SW, plus one extra encoding FENCE.TSO which -facilitates mapping of "acquire+release" or RVTSO semantics. The -remaining seven combinations have empty predecessor and/or successor -sets and hence are no-ops. Of the ten non-trivial options, only six are -commonly used in practice:

-
-
-
    -
  • -

    FENCE RW,RW

    -
  • -
  • -

    FENCE.TSO

    -
  • -
  • -

    FENCE RW,W

    -
  • -
  • -

    FENCE R,RW

    -
  • -
  • -

    FENCE R,R

    -
  • -
  • -

    FENCE W,W

    -
  • -
-
-
-

FENCE instructions using any other combination of PR, PW, SR, and SW are -reserved. We strongly recommend that programmers stick to these six. -Other combinations may have unknown or unexpected interactions with the -memory model.

-
-
-

Finally, we note that since RISC-V uses a multi-copy atomic memory -model, programmers can reason about fence bits in a thread-local -manner. There is no complex notion of "fence cumulativity" as found in -memory models that are not multi-copy atomic.

-
-
-
-

A.3.7. Explicit Synchronization (Rules 5-8)

-
- - - - - -
- - -
-

Rule 5: a has an acquire annotation

-
-
-

Rule 6: b has a release annotation

-
-
-

Rule 7: a and b both have RCsc annotations

-
-
-

Rule 8: a is paired with b

-
-
-
-
-

An acquire operation, as would be used at the start of a critical -section, requires all memory operations following the acquire in program -order to also follow the acquire in the global memory order. This -ensures, for example, that all loads and stores inside the critical -section are up to date with respect to the synchronization variable -being used to protect it. Acquire ordering can be enforced in one of two -ways: with an acquire annotation, which enforces ordering with respect -to just the synchronization variable itself, or with a FENCE R,RW, which -enforces ordering with respect to all previous loads.

-
-
-
Listing 2. A spinlock with atomics
-
-
          sd           x1, (a1)     # Arbitrary unrelated store
-          ld           x2, (a2)     # Arbitrary unrelated load
-          li           t0, 1        # Initialize swap value.
-      again:
-          amoswap.w.aq t0, t0, (a0) # Attempt to acquire lock.
-          bnez         t0, again    # Retry if held.
-          # ...
-          # Critical section.
-          # ...
-          amoswap.w.rl x0, x0, (a0) # Release lock by storing 0.
-          sd           x3, (a3)     # Arbitrary unrelated store
-          ld           x4, (a4)     # Arbitrary unrelated load
-
-
-
-

Consider Listing 2. -Because this example uses aq, the loads and stores in the critical -section are guaranteed to appear in the global memory order after the -AMOSWAP used to acquire the lock. However, assuming a0, a1, and a2 -point to different memory locations, the loads and stores in the -critical section may or may not appear after the "Arbitrary unrelated -load" at the beginning of the example in the global memory order.

-
-
-
Listing 3. A spinlock with fences
-
-
          sd           x1, (a1)     # Arbitrary unrelated store
-          ld           x2, (a2)     # Arbitrary unrelated load
-          li           t0, 1        # Initialize swap value.
-      again:
-          amoswap.w    t0, t0, (a0) # Attempt to acquire lock.
-          fence        r, rw        # Enforce "acquire" memory ordering
-          bnez         t0, again    # Retry if held.
-          # ...
-          # Critical section.
-          # ...
-          fence        rw, w        # Enforce "release" memory ordering
-          amoswap.w    x0, x0, (a0) # Release lock by storing 0.
-          sd           x3, (a3)     # Arbitrary unrelated store
-          ld           x4, (a4)     # Arbitrary unrelated load
-
-
-
-

Now, consider the alternative in Listing 3. In -this case, even though the AMOSWAP does not enforce ordering with an -aq bit, the fence nevertheless enforces that the acquire AMOSWAP -appears earlier in the global memory order than all loads and stores in -the critical section. Note, however, that in this case, the fence also -enforces additional orderings: it also requires that the "Arbitrary -unrelated load" at the start of the program appears earlier in the -global memory order than the loads and stores of the critical section. -(This particular fence does not, however, enforce any ordering with -respect to the "Arbitrary unrelated store" at the start of the -snippet.) In this way, fence-enforced orderings are slightly coarser -than orderings enforced by .aq.

-
-
-

Release orderings work exactly the same as acquire orderings, just in -the opposite direction. Release semantics require all loads and stores -preceding the release operation in program order to also precede the -release operation in the global memory order. This ensures, for example, -that memory accesses in a critical section appear before the -lock-releasing store in the global memory order. Just as for acquire -semantics, release semantics can be enforced using release annotations -or with a FENCE RW,W operation. Using the same examples, the ordering -between the loads and stores in the critical section and the "Arbitrary -unrelated store" at the end of the code snippet is enforced only by the -FENCE RW,W in Listing 3, not by -the rl in Listing 2.

-
-
-

With RCpc annotations alone, store-release-to-load-acquire ordering is -not enforced. This facilitates the porting of code written under the TSO -and/or RCpc memory models. To enforce store-release-to-load-acquire -ordering, the code must use store-release-RCsc and load-acquire-RCsc -operations so that PPO rule 7 applies. RCpc alone is -sufficient for many use cases in C/C++ but is insufficient for many -other use cases in C/C++, Java, and Linux, to name just a few examples; -see Memory Porting for details.

-
-
-

PPO rule 8 indicates that an SC must appear after -its paired LR in the global memory order. This will follow naturally -from the common use of LR/SC to perform an atomic read-modify-write -operation due to the inherent data dependency. However, PPO -rule 8 also applies even when the value being stored -does not syntactically depend on the value returned by the paired LR.

-
-
-

Lastly, we note that just as with fences, programmers need not worry -about "cumulativity" when analyzing ordering annotations.

-
-
-
-

A.3.8. Syntactic Dependencies (Rules 9-11)

-
- - - - - -
- - -
-

Rule 9: b has a syntactic address dependency on a

-
-
-

Rule 10: b has a syntactic data dependency on a

-
-
-

Rule 11: b is a store, and b has a syntactic control dependency on a

-
-
-
-
-

Dependencies from a load to a later memory operation in the same hart -are respected by the RVWMO memory model. The Alpha memory model was -notable for choosing not to enforce the ordering of such dependencies, -but most modern hardware and software memory models consider allowing -dependent instructions to be reordered too confusing and -counterintuitive. Furthermore, modern code sometimes intentionally uses -such dependencies as a particularly lightweight ordering enforcement -mechanism.

-
-
-

The terms in Section 18.1.2 work as follows. Instructions -are said to carry dependencies from their -source register(s) to their destination register(s) whenever the value -written into each destination register is a function of the source -register(s). For most instructions, this means that the destination -register(s) carry a dependency from all source register(s). However, -there are a few notable exceptions. In the case of memory instructions, -the value written into the destination register ultimately comes from -the memory system rather than from the source register(s) directly, and -so this breaks the chain of dependencies carried from the source -register(s). In the case of unconditional jumps, the value written into -the destination register comes from the current pc (which is never -considered a source register by the memory model), and so likewise, JALR -(the only jump with a source register) does not carry a dependency from -rs1 to rd.

-
-
-
Listing 4. (c) has a syntactic dependency on both (a) and (b) via fflags, a destination register that both (a) and (b) implicitly accumulate into
-
-
(a) fadd f3,f1,f2
-(b) fadd f6,f4,f5
-(c) csrrs a0,fflags,x0
-
-
-
-

The notion of accumulating into a destination register rather than -writing into it reflects the behavior of CSRs such as fflags. In -particular, an accumulation into a register does not clobber any -previous writes or accumulations into the same register. For example, in -Listing 4, (c) has a syntactic dependency on both (a) and (b).

-
-
-

Like other modern memory models, the RVWMO memory model uses syntactic -rather than semantic dependencies. In other words, this definition -depends on the identities of the registers being accessed by different -instructions, not the actual contents of those registers. This means -that an address, control, or data dependency must be enforced even if -the calculation could seemingly be optimized away. This choice -ensures that RVWMO remains compatible with code that uses these false -syntactic dependencies as a lightweight ordering mechanism.

-
-
-
Listing 5. A syntactic address dependency
-
-
ld a1,0(s0)
-xor a2,a1,a1
-add s1,s1,a2
-ld a5,0(s1)
-
-
-
-

For example, there is a syntactic address dependency from the memory -operation generated by the first instruction to the memory operation -generated by the last instruction in -Listing 5, even though a1 XOR -a1 is zero and hence has no effect on the address accessed by the -second load.

-
-
-

The benefit of using dependencies as a lightweight synchronization -mechanism is that the ordering enforcement requirement is limited only -to the specific two instructions in question. Other non-dependent -instructions may be freely reordered by aggressive implementations. One -alternative would be to use a load-acquire, but this would enforce -ordering for the first load with respect to all subsequent -instructions. Another would be to use a FENCE R,R, but this would -include all previous and all subsequent loads, making this option more -expensive.

-
-
-
Listing 6. A syntactic control dependency
-
-
lw x1,0(x2)
-bne x1,x0,next
-sw x3,0(x4)
-next: sw x5,0(x6)
-
-
-
-

Control dependencies behave differently from address and data -dependencies in the sense that a control dependency always extends to -all instructions following the original target in program order. -Consider Listing 6: the -instruction at next will always execute, but the memory operation -generated by that last instruction nevertheless still has a control -dependency from the memory operation generated by the first instruction.

-
-
-
Listing 7. Another syntactic control dependency
-
-
lw x1,0(x2)
-bne x1,x0,next
-next: sw x3,0(x4)
-
-
-
-

Likewise, consider Listing 7. -Even though both branch outcomes have the same target, there is still a -control dependency from the memory operation generated by the first -instruction in this snippet to the memory operation generated by the -last instruction. This definition of control dependency is subtly -stronger than what might be seen in other contexts (e.g., C++), but it -conforms with standard definitions of control dependencies in the -literature.

-
-
-

Notably, PPO rules 9-11 are also -intentionally designed to respect dependencies that originate from the -output of a successful store-conditional instruction. Typically, an SC -instruction will be followed by a conditional branch checking whether -the outcome was successful; this implies that there will be a control -dependency from the store operation generated by the SC instruction to -any memory operations following the branch. PPO -rule 11 in turn implies that any subsequent store -operations will appear later in the global memory order than the store -operation generated by the SC. However, since control, address, and data -dependencies are defined over memory operations, and since an -unsuccessful SC does not generate a memory operation, no order is -enforced between an unsuccessful SC and its dependent instructions. -Moreover, since SC is defined to carry dependencies from its source -registers to rd only when the SC is successful, an unsuccessful SC has -no effect on the global memory order.

-
- - ---- - - - - - - -
Table 38. A variant of the LB litmus test (outcome forbidden)
------ - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -

Initial values: 0(s0)=1; 0(s2)=1

Hart 0

Hart 1

(a)

ld a0,0(s0)

(e)

ld a3,0(s2)

(b)

lr a1,0(s1)

(f)

sd a3,0(s0)

(c)

sc a2,a0,0(s1)

(d)

sd a2,0(s2)

Outcome: a0=0, a3=0

--- - - - - - -
-
-litmus lb lrsc -
-
-
-

In addition, the choice to respect dependencies originating at -store-conditional instructions ensures that certain out-of-thin-air-like -behaviors will be prevented. Consider -Table 38. Suppose a -hypothetical implementation could occasionally make some early guarantee -that a store-conditional operation will succeed. In this case, (c) could -return 0 to a2 early (before actually executing), allowing the -sequence (d), (e), (f), (a), and then (b) to execute, and then (c) might -execute (successfully) only at that point. This would imply that (c) -writes its own success value to 0(s1)! Fortunately, this situation and -others like it are prevented by the fact that RVWMO respects -dependencies originating at the stores generated by successful SC -instructions.

-
-
-

We also note that syntactic dependencies between instructions only have -any force when they take the form of a syntactic address, control, -and/or data dependency. For example, a syntactic dependency between two -F instructions via one of the accumulating CSRs in -Section 18.3 does not imply -that the two F instructions must be executed in order. Such a -dependency would only serve to ultimately set up a dependency from -both F instructions to a later CSR instruction accessing the CSR -flag in question.

-
-
-
-

A.3.9. Pipeline Dependencies (Rules 12-13)

-
- - - - - -
- - -
-

Rule 12: b is a load, and there exists some store m between a and b in -program order such that m has an address or data dependency on a, -and b returns a value written by m

-
-
-

Rule 13: b is a store, and there exists some instruction m between a and -b in program order such that m has an address dependency on a

-
-
-
- - ---- - - - - - - -
Table 39. Because of PPO rule 12 and the data dependency from (d) to (e), (d) must also precede (f) in the global memory order (outcome forbidden)
------ - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
Hart 0Hart 1

li t1, 1

(d)

lw a0, 0(s1)

(a)

sw t1,0(s0)

(e)

sw a0, 0(s2)

(b)

fence w, w

(f)

lw a1, 0(s2)

(c)

sw t1,0(s1)

xor a2,a1,a1

add s0,s0,a2

(g)

lw a3,0(s0)

Outcome: a0=1, a3=0

--- - - - - - -
-
-litmus datarfi -
-
-
-

PPO rules 12 and 13 reflect behaviors of almost all real processor -pipeline implementations. Rule 12 -states that a load cannot forward from a store until the address and -data for that store are known. Consider Table 39: (f) cannot be -executed until the data for (e) has been resolved, because (f) must -return the value written by (e) (or by something even later in the -global memory order), and the old value must not be clobbered by the -writeback of (e) before (d) has had a chance to perform. Therefore, (f) -will never perform before (d) has performed.

-
- - ---- - - - - - - -
Table 40. Because of the extra store between (e) and (g), (d) no longer necessarily precedes (g) (outcome permitted)
------ - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
Hart 0Hart 1

li t1, 1

li t1, 1

(a)

sw t1,0(s0)

(d)

lw a0, 0(s1)

(b)

fence w, w

(e)

sw a0, 0(s2)

(c)

sw t1,0(s1)

(f)

sw t1, 0(s2)

(g)

lw a1, 0(s2)

xor a2,a1,a1

add s0,s0,a2

(h)

lw a3,0(s0)

Outcome: a0=1, a3=0

--- - - - - - -
-
-litmus datacoirfi -
-
-
-

If there were another store to the same address in between (e) and (f), -as in Table 40, -then (f) would no longer be dependent on the data of (e) being resolved, -and hence the dependency of (f) on (d), which produces the data for (e), -would be broken.

-
-
-

Rule 13 makes a similar observation to the -previous rule: a store cannot be performed at memory until all previous -loads that might access the same address have themselves been performed. -Such a load must appear to execute before the store, but it cannot do so -if the store were to overwrite the value in memory before the load had a -chance to read the old value. Likewise, a store generally cannot be -performed until it is known that preceding instructions will not cause -an exception due to failed address resolution, and in this sense, -rule 13 can be seen as somewhat of a special case -of rule 11.

-
- - ---- - - - - - - -
Table 41. Because of the address dependency from (d) to (e), (d) also precedes (f) (outcome forbidden)
------ - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
Hart 0Hart 1

li t1, 1

(a)

lw a0,0(s0)

(d)

lw a1, 0(s1)

(b)

fence rw,rw

(e)

lw a2, 0(a1)

(c)

sw s2,0(s1)

(f)

sw t1, 0(s0)

Outcome: a0=1, a1=t

--- - - - - - -
-

litmus addrpo

-
-
-

Consider Table 41: (f) cannot be -executed until the address for (e) is resolved, because it may turn out -that the addresses match; i.e., that a1=s0. Therefore, (f) cannot be -sent to memory before (d) has executed and confirmed whether the -addresses do indeed overlap.

-
-
-
-
-

A.4. Beyond Main Memory

-
-

RVWMO does not currently attempt to formally describe how FENCE.I, -SFENCE.VMA, I/O fences, and PMAs behave. All of these behaviors will be -described by future formalizations. In the meantime, the behavior of -FENCE.I is described in Chapter 6, the -behavior of SFENCE.VMA is described in the RISC-V Instruction Set -Privileged Architecture Manual, and the behavior of I/O fences and the -effects of PMAs are described below.

-
-
-

A.4.1. Coherence and Cacheability

-
-

The RISC-V Privileged ISA defines Physical Memory Attributes (PMAs) -which specify, among other things, whether portions of the address space -are coherent and/or cacheable. See the RISC-V Privileged ISA -Specification for the complete details. Here, we simply discuss how the -various details in each PMA relate to the memory model:

-
-
-
    -
  • -

    Main memory vs. I/O, and I/O memory ordering PMAs: the memory model as -defined applies to main memory regions. I/O ordering is discussed below.

    -
  • -
  • -

    Supported access types and atomicity PMAs: the memory model is simply -applied on top of whatever primitives each region supports.

    -
  • -
  • -

    Cacheability PMAs: the cacheability PMAs in general do not affect the -memory model. Non-cacheable regions may have more restrictive behavior -than cacheable regions, but the set of allowed behaviors does not change -regardless. However, some platform-specific and/or device-specific -cacheability settings may differ.

    -
  • -
  • -

    Coherence PMAs: The memory consistency model for memory regions marked -as non-coherent in PMAs is currently platform-specific and/or -device-specific: the load-value axiom, the atomicity axiom, and the -progress axiom all may be violated with non-coherent memory. Note -however that coherent memory does not require a hardware cache coherence -protocol. The RISC-V Privileged ISA Specification suggests that -hardware-incoherent regions of main memory are discouraged, but the -memory model is compatible with hardware coherence, software coherence, -implicit coherence due to read-only memory, implicit coherence due to -only one agent having access, or otherwise.

    -
  • -
  • -

    Idempotency PMAs: Idempotency PMAs are used to specify memory regions -for which loads and/or stores may have side effects, and this in turn is -used by the microarchitecture to determine, e.g., whether prefetches are -legal. This distinction does not affect the memory model.

    -
  • -
-
-
-
-

A.4.2. I/O Ordering

-
-

For I/O, the load value axiom and atomicity axiom in general do not -apply, as both reads and writes might have device-specific side effects -and may return values other than the value "written" by the most -recent store to the same address. Nevertheless, the following preserved -program order rules still generally apply for accesses to I/O memory: -memory access a precedes memory access b in -global memory order if a precedes b in -program order and one or more of the following holds:

-
-
-
    -
  1. -

    a precedes b in preserved program order as -defined in Chapter 18, with the exception -that acquire and release ordering annotations apply only from one memory -operation to another memory operation and from one I/O operation to -another I/O operation, but not from a memory operation to an I/O nor -vice versa

    -
  2. -
  3. -

    a and b are accesses to overlapping -addresses in an I/O region

    -
  4. -
  5. -

    a and b are accesses to the same strongly -ordered I/O region

    -
  6. -
  7. -

    a and b are accesses to I/O regions, and -the channel associated with the I/O region accessed by either -a or b is channel 1

    -
  8. -
  9. -

    a and b are accesses to I/O regions -associated with the same channel (except for channel 0)

    -
  10. -
-
-
-

Note that the FENCE instruction distinguishes between main memory -operations and I/O operations in its predecessor and successor sets. To -enforce ordering between I/O operations and main memory operations, code -must use a FENCE with PI, PO, SI, and/or SO, plus PR, PW, SR, and/or SW. -For example, to enforce ordering between a write to main memory and an -I/O write to a device register, a FENCE W,O or stronger is needed.

-
-
-
Listing 8. Ordering memory and I/O accesses
-
-
sd t0, 0(a0)
-fence w,o
-sd a0, 0(a1)
-
-
-
-

When a fence is in fact used, implementations must assume that the -device may attempt to access memory immediately after receiving the MMIO -signal, and subsequent memory accesses from that device to memory must -observe the effects of all accesses ordered prior to that MMIO -operation. In other words, in Listing 8, -suppose 0(a0) is in main memory and 0(a1) is the address of a device -register in I/O memory. If the device accesses 0(a0) upon receiving -the MMIO write, then that load must conceptually appear after the first -store to 0(a0) according to the rules of the RVWMO memory model. In -some implementations, the only way to ensure this will be to require -that the first store does in fact complete before the MMIO write is -issued. Other implementations may find ways to be more aggressive, while -others still may not need to do anything different at all for I/O and -main memory accesses. Nevertheless, the RVWMO memory model does not -distinguish between these options; it simply provides an -implementation-agnostic mechanism to specify the orderings that must be -enforced.

-
-
-

Many architectures include separate notions of "ordering" and -"completion" fences, especially as it relates to I/O (as opposed to -regular main memory). Ordering fences simply ensure that memory -operations stay in order, while completion fences ensure that -predecessor accesses have all completed before any successors are made -visible. RISC-V does not explicitly distinguish between ordering and -completion fences. Instead, this distinction is simply inferred from -different uses of the FENCE bits.

-
-
-

For implementations that conform to the RISC-V Unix Platform -Specification, I/O devices and DMA operations are required to access -memory coherently and via strongly ordered I/O channels. Therefore, -accesses to regular main memory regions that are concurrently accessed -by external devices can also use the standard synchronization -mechanisms. Implementations that do not conform to the Unix Platform -Specification and/or in which devices do not access memory coherently -will need to use mechanisms (which are currently platform-specific or -device-specific) to enforce coherency.

-
-
-

I/O regions in the address space should be considered non-cacheable -regions in the PMAs for those regions. Such regions can be considered -coherent by the PMA if they are not cached by any agent.

-
-
-

The ordering guarantees in this section may not apply beyond a -platform-specific boundary between the RISC-V cores and the device. In -particular, I/O accesses sent across an external bus (e.g., PCIe) may be -reordered before they reach their ultimate destination. Ordering must be -enforced in such situations according to the platform-specific rules of -those external devices and buses.

-
-
-
-
-

A.5. Code Porting and Mapping Guidelines

- - ---- - - - - - - - - - - - - - - - - - - - - - - - - -
Table 42. Mappings from TSO operations to RISC-V operations
x86/TSO OperationRVWMO Mapping

Load

l{b|h|w|d}; fence r,rw

Store

fence rw,w; s{b|h|w|d}

Atomic RMW

amo<op>.{w|d}.aqrl OR
-loop:lr.{w|d}.aq; <op>; sc.{w|d}.aqrl; bnez loop

Fence

fence rw,rw

-
-

Table 42 provides a mapping from TSO memory -operations onto RISC-V memory instructions. Normal x86 loads and stores -are all inherently acquire-RCpc and release-RCpc operations: TSO -enforces all load-load, load-store, and store-store ordering by default. -Therefore, under RVWMO, all TSO loads must be mapped onto a load -followed by FENCE R,RW, and all TSO stores must be mapped onto -FENCE RW,W followed by a store. TSO atomic read-modify-writes and x86 -instructions using the LOCK prefix are fully ordered and can be -implemented either via an AMO with both aq and rl set, or via an LR -with aq set, the arithmetic operation in question, an SC with both -aq and rl set, and a conditional branch checking the success -condition. In the latter case, the rl annotation on the LR turns out -(for non-obvious reasons) to be redundant and can be omitted.

-
-
-

Alternatives to Table 42 are also possible. A TSO -store can be mapped onto AMOSWAP with rl set. However, since RVWMO PPO -Rule 3 forbids forwarding of values from -AMOs to subsequent loads, the use of AMOSWAP for stores may negatively -affect performance. A TSO load can be mapped using LR with aq set: all -such LR instructions will be unpaired, but that fact in and of itself -does not preclude the use of LR for loads. However, again, this mapping -may also negatively affect performance if it puts more pressure on the -reservation mechanism than was originally intended.

-
- - ---- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
Table 43. Mappings from Power operations to RISC-V operations
Power OperationRVWMO Mapping

Load

l{b|h|w|d}

Load-Reserve

lr.{w|d}

Store

s{b|h|w|d}

Store-Conditional

sc.{w|d}

lwsync

fence.tso

sync

fence rw,rw

isync

fence.i; fence r,r

-
-

Table 43 provides a mapping from Power memory -operations onto RISC-V memory instructions. Power ISYNC maps on RISC-V -to a FENCE.I followed by a FENCE R,R; the latter fence is needed because -ISYNC is used to define a "control+control fence" dependency that is -not present in RVWMO.

-
- - ---- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
Table 44. Mappings from ARM operations to RISC-V operations
ARM OperationRVWMO Mapping

Load

l{b|h|w|d}

Load-Acquire

fence rw, rw; l{b|h|w|d}; fence r,rw

Load-Exclusive

lr.{w|d}

Load-Acquire-Exclusive

lr.{w|d}.aqrl

Store

s{b|h|w|d}

Store-Release

fence rw,w; s{b|h|w|d}

Store-Exclusive

sc.{w|d}

Store-Release-Exclusive

sc.{w|d}.rl

dmb

fence rw,rw

dmb.ld

fence r,rw

dmb.st

fence w,w

isb

fence.i; fence r,r

-
-

Table 44 provides a mapping from ARM memory -operations onto RISC-V memory instructions. Since RISC-V does not -currently have plain load and store opcodes with aq or rl -annotations, ARM load-acquire and store-release operations should be -mapped using fences instead. Furthermore, in order to enforce -store-release-to-load-acquire ordering, there must be a FENCE RW,RW -between the store-release and load-acquire; Table 44 -enforces this by always placing the fence in front of each acquire -operation. ARM load-exclusive and store-exclusive instructions can -likewise map onto their RISC-V LR and SC equivalents, but instead of -placing a FENCE RW,RW in front of an LR with aq set, we simply also -set rl instead. ARM ISB maps on RISC-V to FENCE.I followed by -FENCE R,R similarly to how ISYNC maps for Power.

-
- - ---- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
Table 45. Mappings from Linux memory primitives to RISC-V primitives.
Linux OperationRVWMO Mapping

smp_mb()

fence rw,rw

smp_rmb()

fence r,r

smp_wmb()

fence w,w

dma_rmb()

fence r,r

dma_wmb()

fence w,w

mb()

fence iorw,iorw

rmb()

fence ri,ri

wmb()

fence wo,wo

smp_load_acquire()

l{b|h|w|d}; fence r,rw

smp_store_release()

fence.tso; s{b|h|w|d}

Linux Construct

RVWMO AMO Mapping

atomic <op> relaxed

amo <op>.{w|d}

atomic <op> acquire

amo <op>.{w|d}.aq

atomic <op> release

amo <op>.{w|d}.rl

atomic <op>

amo <op>.{w|d}.aqrl

Linux Construct

RVWMO LR/SC Mapping

atomic <op> relaxed

loop:lr.{w|d}; <op>; sc.{w|d}; bnez loop

atomic <op> acquire

loop:lr.{w|d}.aq; <op>; sc.{w|d}; bnez loop

atomic <op> release

loop:lr.{w|d}; <op>; sc.{w|d}.aqrl^*; bnez loop OR

fence.tso; loop:lr.{w|d}; <op>; sc.{w|d}^*; bnez loop

atomic <op>

loop:lr.{w|d}.aq; <op>; sc.{w|d}.aqrl; bnez loop

-
-

With regard to Table 45, other -constructs (such as spinlocks) should follow accordingly. Platforms or -devices with non-coherent DMA may need additional synchronization (such -as cache flush or invalidate mechanisms); currently any such extra -synchronization will be device-specific.

-
-
-

Table 45 provides a mapping of Linux memory -ordering macros onto RISC-V memory instructions. The Linux fences -dma_rmb() and dma_wmb() map onto FENCE R,R and FENCE W,W, -respectively, since the RISC-V Unix Platform requires coherent DMA, but -would be mapped onto FENCE RI,RI and FENCE WO,WO, respectively, on a -platform with non-coherent DMA. Platforms with non-coherent DMA may also -require a mechanism by which cache lines can be flushed and/or -invalidated. Such mechanisms will be device-specific and/or standardized -in a future extension to the ISA.

-
-
-

The Linux mappings for release operations may seem stronger than -necessary, but these mappings are needed to cover some cases in which -Linux requires stronger orderings than the more intuitive mappings would -provide. In particular, as of the time this text is being written, Linux -is actively debating whether to require load-load, load-store, and -store-store orderings between accesses in one critical section and -accesses in a subsequent critical section in the same hart and protected -by the same synchronization object. Not all combinations of -FENCE RW,W/FENCE R,RW mappings with aq/rl mappings combine to -provide such orderings. There are a few ways around this problem, -including:

-
-
-
    -
  1. -

    Always use FENCE RW,W/FENCE R,RW, and never use aq/rl. This -suffices but is undesirable, as it defeats the purpose of the aq/rl -modifiers.

    -
  2. -
  3. -

    Always use aq/rl, and never use FENCE RW,W/FENCE R,RW. This does -not currently work due to the lack of load and store opcodes with aq -and rl modifiers.

    -
  4. -
  5. -

    Strengthen the mappings of release operations such that they would -enforce sufficient orderings in the presence of either type of acquire -mapping. This is the currently recommended solution, and the one shown -in Table 45.

    -
  6. -
-
-
-

RVWMO Mapping: (a) lw a0, 0(s0) (b) fence.tso // vs. fence rw,w (c) sd -x0,0(s1) …​ loop: (d) amoswap.d.aq a1,t1,0(s1) bnez a1,loop (e) lw -a2,0(s2)

-
-
-

For example, the critical section ordering rule currently being debated -by the Linux community would require (a) to be ordered before (e) in -Listing 9. If that will indeed be -required, then it would be insufficient for (b) to map as FENCE RW,W. -That said, these mappings are subject to change as the Linux Kernel -Memory Model evolves.

-
-
-
Listing 9. Orderings between critical sections in Linux
-
-
Linux Code:
-(a) int r0 = *x;
-       (bc) spin_unlock(y, 0);
-....
-....
-(d) spin_lock(y);
-(e) int r1 = *z;
-
-RVWMO Mapping:
-(a) lw a0, 0(s0)
-(b) fence.tso // vs. fence rw,w
-(c) sd x0,0(s1)
-....
-loop:
-(d) lr.d.aq a1,(s1)
-bnez a1,loop
-sc.d a1,t1,(s1)
-bnez a1,loop
-(e) lw a2,0(s2)
-
-
-
-

Table 46 provides a mapping of C11/C++11 atomic -operations onto RISC-V memory instructions. If load and store opcodes -with aq and rl modifiers are introduced, then the mappings in -Table 47 will suffice. Note however that -the two mappings only interoperate correctly if -atomic_<op>(memory_order_seq_cst) is mapped using an LR that has both -aq and rl set. -Even more importantly, a Table 46 sequentially consistent store, -followed by a Table 47 sequentially consistent load -can be reordered unless the Table 46 mapping of stores is -strengthened by either adding a second fence or mapping the store -to amoswap.rl instead.

-
- - ---- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
Table 46. Mappings from C/C++ primitives to RISC-V primitives.
C/C++ ConstructRVWMO Mapping

Non-atomic load

l{b|h|w|d}

atomic_load(memory_order_relaxed)

l{b|h|w|d}

atomic_load(memory_order_acquire)

l{b|h|w|d}; fence r,rw

atomic_load(memory_order_seq_cst)

fence rw,rw; l{b|h|w|d}; fence r,rw

Non-atomic store

s{b|h|w|d}

atomic_store(memory_order_relaxed)

s{b|h|w|d}

atomic_store(memory_order_release)

fence rw,w; s{b|h|w|d}

atomic_store(memory_order_seq_cst)

fence rw,w; s{b|h|w|d}

atomic_thread_fence(memory_order_acquire)

fence r,rw

atomic_thread_fence(memory_order_release)

fence rw,w

atomic_thread_fence(memory_order_acq_rel)

fence.tso

atomic_thread_fence(memory_order_seq_cst)

fence rw,rw

C/C++ Construct

RVWMO AMO Mapping

atomic_<op>(memory_order_relaxed)

amo<op>.{w|d}

atomic_<op>(memory_order_acquire)

amo<op>.{w|d}.aq

atomic_<op>(memory_order_release)

amo<op>.{w|d}.rl

atomic_<op>(memory_order_acq_rel)

amo<op>.{w|d}.aqrl

atomic_<op>(memory_order_seq_cst)

amo<op>.{w|d}.aqrl

C/C++ Construct

RVWMO LR/SC Mapping

atomic_<op>(memory_order_relaxed)

loop:lr.{w|d}; <op>; sc.{w|d};

bnez loop

atomic_<op>(memory_order_acquire)

loop:lr.{w|d}.aq; <op>; sc.{w|d};

bnez loop

atomic_<op>(memory_order_release)

loop:lr.{w|d}; <op>; sc.{w|d}.rl;

bnez loop

atomic_<op>(memory_order_acq_rel)

loop:lr.{w|d}.aq; <op>; sc.{w|d}.rl;

bnez loop

atomic_<op>(memory_order_seq_cst)

loop:lr.{w|d}.aqrl; <op>;

sc.{w|d}.rl; bnez loop

- - ---- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
Table 47. Hypothetical mappings from C/C++ primitives to RISC-V primitives, if native load-acquire and store-release opcodes are introduced.
C/C++ ConstructRVWMO Mapping

Non-atomic load

l{b|h|w|d}

atomic_load(memory_order_relaxed)

l{b|h|w|d}

atomic_load(memory_order_acquire)

l{b|h|w|d}.aq

atomic_load(memory_order_seq_cst)

l{b|h|w|d}.aq

Non-atomic store

s{b|h|w|d}

atomic_store(memory_order_relaxed)

s{b|h|w|d}

atomic_store(memory_order_release)

s{b|h|w|d}.rl

atomic_store(memory_order_seq_cst)

s{b|h|w|d}.rl

atomic_thread_fence(memory_order_acquire)

fence r,rw

atomic_thread_fence(memory_order_release)

fence rw,w

atomic_thread_fence(memory_order_acq_rel)

fence.tso

atomic_thread_fence(memory_order_seq_cst)

fence rw,rw

C/C++ Construct

RVWMO AMO Mapping

atomic_<op>(memory_order_relaxed)

amo<op>.{w|d}

atomic_<op>(memory_order_acquire)

amo<op>.{w|d}.aq

atomic_<op>(memory_order_release)

amo<op>.{w|d}.rl

atomic_<op>(memory_order_acq_rel)

amo<op>.{w|d}.aqrl

atomic_<op>(memory_order_seq_cst)

amo<op>.{w|d}.aqrl

C/C++ Construct

RVWMO LR/SC Mapping

atomic_<op>(memory_order_relaxed)

lr.{w|d}; <op>; sc.{w|d}

atomic_<op>(memory_order_acquire)

lr.{w|d}.aq; <op>; sc.{w|d}

atomic_<op>(memory_order_release)

lr.{w|d}; <op>; sc.{w|d}.rl

atomic_<op>(memory_order_acq_rel)

lr.{w|d}.aq; <op>; sc.{w|d}.rl

atomic_<op>(memory_order_seq_cst)

lr.{w|d}.aq*; <op>; sc.{w|d}.rl

* must be lr.{w|d}.aqrl in order to interoperate with code mapped per Table 46

-
-

Any AMO can be emulated by an LR/SC pair, but care must be taken to -ensure that any PPO orderings that originate from the LR are also made -to originate from the SC, and that any PPO orderings that terminate at -the SC are also made to terminate at the LR. For example, the LR must -also be made to respect any data dependencies that the AMO has, given -that load operations do not otherwise have any notion of a data -dependency. Likewise, the effect of a FENCE R,R elsewhere in the same hart -must also be made to apply to the SC, which would not otherwise respect -that fence. The emulator may achieve this effect by simply mapping AMOs -onto lr.aq; <op>; sc.aqrl, matching the mapping used elsewhere for -fully ordered atomics.

-
-
-

These C11/C++11 mappings require the platform to provide the following -Physical Memory Attributes (as defined in the RISC-V Privileged ISA) for -all memory:

-
-
-
    -
  • -

    main memory

    -
  • -
  • -

    coherent

    -
  • -
  • -

    AMOArithmetic

    -
  • -
  • -

    RsrvEventual

    -
  • -
-
-
-

Platforms with different attributes may require different mappings, or -require platform-specific SW (e.g., memory-mapped I/O).

-
-
-
-

A.6. Implementation Guidelines

-
-

The RVWMO and RVTSO memory models by no means preclude -microarchitectures from employing sophisticated speculation techniques -or other forms of optimization in order to deliver higher performance. -The models also do not impose any requirement to use any one particular -cache hierarchy, nor even to use a cache coherence protocol at all. -Instead, these models only specify the behaviors that can be exposed to -software. Microarchitectures are free to use any pipeline design, any -coherent or non-coherent cache hierarchy, any on-chip interconnect, -etc., as long as the design only admits executions that satisfy the -memory model rules. That said, to help people understand the actual -implementations of the memory model, in this section we provide some -guidelines on how architects and programmers should interpret the -models' rules.

-
-
-

Both RVWMO and RVTSO are multi-copy atomic (or -other-multi-copy-atomic): any store value that is visible to a hart -other than the one that originally issued it must also be conceptually -visible to all other harts in the system. In other words, harts may -forward from their own previous stores before those stores have become -globally visible to all harts, but no early inter-hart forwarding is -permitted. Multi-copy atomicity may be enforced in a number of ways. It -might hold inherently due to the physical design of the caches and store -buffers, it may be enforced via a single-writer/multiple-reader cache -coherence protocol, or it might hold due to some other mechanism.

-
-
-

Although multi-copy atomicity does impose some restrictions on the -microarchitecture, it is one of the key properties keeping the memory -model from becoming extremely complicated. For example, a hart may not -legally forward a value from a neighbor hart’s private store buffer -(unless of course it is done in such a way that no new illegal behaviors -become architecturally visible). Nor may a cache coherence protocol -forward a value from one hart to another until the coherence protocol -has invalidated all older copies from other caches. Of course, -microarchitectures may (and high-performance implementations likely -will) violate these rules under the covers through speculation or other -optimizations, as long as any non-compliant behaviors are not exposed to -the programmer.

-
-
-

As a rough guideline for interpreting the PPO rules in RVWMO, we expect -the following from the software perspective:

-
-
-
    -
  • -

    programmers will use PPO rules 1 and 4-8 regularly and actively.

    -
  • -
  • -

    expert programmers will use PPO rules 9-11 to speed up critical paths -of important data structures.

    -
  • -
  • -

    even expert programmers will rarely if ever use PPO rules 2-3 and -12-13 directly. -These are included to facilitate common microarchitectural optimizations -(rule 2) and the operational formal modeling approach (rules 3 and -12-13) described -in Section B.3. They also facilitate the -process of porting code from other architectures that have similar -rules.

    -
  • -
-
-
-

We also expect the following from the hardware perspective:

-
-
-
    -
  • -

    PPO rules 1 and 3-6 reflect -well-understood rules that should pose few surprises to architects.

    -
  • -
  • -

    PPO rule 2 reflects a natural and common hardware -optimization, but one that is very subtle and hence is worth double -checking carefully.

    -
  • -
  • -

    PPO rule 7 may not be immediately obvious to -architects, but it is a standard memory model requirement.

    -
  • -
  • -

    The load value axiom, the atomicity axiom, and PPO rules -8-13 reflect rules that most -hardware implementations will enforce naturally, unless they contain -extreme optimizations. Of course, implementations should make sure to -double check these rules nevertheless. Hardware must also ensure that -syntactic dependencies are not optimized away.

    -
  • -
-
-
-

Architectures are free to implement any of the memory model rules as -conservatively as they choose. For example, a hardware implementation -may choose to do any or all of the following:

-
-
-
    -
  • -

    interpret all fences as if they were FENCE RW,RW (or FENCE IORW,IORW, -if I/O is involved), regardless of the bits actually set

    -
  • -
  • -

    implement all fences with PW and SR as if they were FENCE RW,RW (or -FENCE IORW,IORW, if I/O is involved), as PW with SR is the most -expensive of the four possible main memory ordering components anyway

    -
  • -
  • -

    emulate aq and rl as described in Section A.5

    -
  • -
  • -

    enforce all same-address load-load ordering, even in the presence of -patterns such as fri-rfi and RSW

    -
  • -
  • -

    forbid any forwarding of a value from a store in the store buffer to a -subsequent AMO or LR to the same address

    -
  • -
  • -

    forbid any forwarding of a value from an AMO or SC in the store buffer -to a subsequent load to the same address

    -
  • -
  • -

    implement TSO on all memory accesses, and ignore any main memory -fences that do not include PW and SR ordering (e.g., as Ztso -implementations will do)

    -
  • -
  • -

    implement all atomics to be RCsc or even fully ordered, regardless of -annotation

    -
  • -
-
-
-

Architectures that implement RVTSO can safely do the following:

-
-
-
    -
  • -

    Ignore all fences that do not have both PW and SR (unless the fence -also orders I/O)

    -
  • -
  • -

    Ignore all PPO rules except for rules 4 through 7, since the rest -are redundant with other PPO rules under RVTSO assumptions

    -
  • -
-
-
-

Other general notes:

-
-
-
    -
  • -

    Silent stores (i.e., stores that write the same value that already -exists at a memory location) behave like any other store from a memory -model point of view. Likewise, AMOs which do not actually change the -value in memory (e.g., an AMOMAX for which the value in rs2 is smaller -than the value currently in memory) are still semantically considered -store operations. Microarchitectures that attempt to implement silent -stores must take care to ensure that the memory model is still obeyed, -particularly in cases such as RSW (Section A.3.5) -which tend to be incompatible with silent stores.

    -
  • -
  • -

    Writes may be merged (i.e., two consecutive writes to the same address -may be merged) or subsumed (i.e., the earlier of two back-to-back writes -to the same address may be elided) as long as the resulting behavior -does not otherwise violate the memory model semantics.

    -
  • -
-
-
-

The question of write subsumption can be understood from the following -example:

-
- - ---- - - - - - - -
Table 48. Write subsumption litmus test, allowed execution
------ - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
Hart 0Hart 1

li t1, 3

li t3, 2

li t2, 1

(a)

sw t1,0(s0)

(d)

lw a0,0(s1)

(b)

fence w, w

(e)

sw a0,0(s0)

(c)

sw t2,0(s1)

(f)

sw t3,0(s0)

--- - - - - - -
-
-litmus subsumption -
-
-
-

As written, if the load (d) reads value 1, then (a) must -precede (f) in the global memory order:

-
-
-
    -
  • -

    (a) precedes (c) in the global memory order because of rule 4

    -
  • -
  • -

    (c) precedes (d) in the global memory order because of the Load -Value axiom

    -
  • -
  • -

    (d) precedes (e) in the global memory order because of rule 10

    -
  • -
  • -

    (e) precedes (f) in the global memory order because of rule 1

    -
  • -
-
-
-

In other words, the final value of the memory location whose address is -in s0 must be 2 (the value written by the store (f)) and -cannot be 3 (the value written by the store (a)).

-
-
-

A very aggressive microarchitecture might erroneously decide to discard -(e), as (f) supersedes it, and this may in turn lead the -microarchitecture to break the now-eliminated dependency between (d) and -(f) (and hence also between (a) and (f)). This would violate the memory -model rules, and hence it is forbidden. Write subsumption may in other -cases be legal, if for example there were no data dependency between (d) -and (e).

-
-
-

A.6.1. Possible Future Extensions

-
-

We expect that any or all of the following possible future extensions -would be compatible with the RVWMO memory model:

-
-
-
    -
  • -

    "V" vector ISA extensions

    -
  • -
  • -

    "J" JIT extension

    -
  • -
  • -

    Native encodings for load and store opcodes with aq and rl set

    -
  • -
  • -

    Fences limited to certain addresses

    -
  • -
  • -

    Cache writeback/flush/invalidate/etc. instructions

    -
  • -
-
-
-
-
-

A.7. Known Issues

-
-

A.7.1. Mixed-size RSW

- - ------ - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
Table 49. Mixed-size discrepancy (permitted by axiomatic models, forbidden by operational model)
Hart 0Hart 1

li t1, 1

li t1, 1

(a)

lw a0,0(s0)

(d)

lw a1,0(s1)

(b)

fence rw,rw

(e)

amoswap.w.rl a2,t1,0(s2)

(c)

sw t1,0(s1)

(f)

ld a3,0(s2)

(g)

lw a4,4(s2)

xor a5,a4,a4

add s0,s0,a5

(h)

sw t1,0(s0)

Outcome: a0=1, a1=1, a2=0, a3=1, a4=0

- - ------ - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
Table 50. Mixed-size discrepancy (permitted by axiomatic models, forbidden by operational model)
Hart 0Hart 1

li t1, 1

li t1, 1

(a)

lw a0,0(s0)

(d)

ld a1,0(s1)

(b)

fence rw,rw

(e)

lw a2,4(s1)

(c)

sw t1,0(s1)

xor a3,a2,a2

add s0,s0,a3

(f)

sw t1,0(s0)

Outcome: a0=1, a1=1, a2=0

- - ------ - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
Table 51. Mixed-size discrepancy (permitted by axiomatic models, forbidden by operational model)
Hart 0Hart 1

li t1, 1

li t1, 1

(a)

lw a0,0(s0)

(d)

sw t1,4(s1)

(b)

fence rw,rw

(e)

ld a1,0(s1)

(c)

sw t1,0(s1)

(f)

lw a2,4(s1)

xor a3,a2,a2

add s0,s0,a3

(g)

sw t1,0(s0)

Outcome: a0=1, a1=0x100000001, a2=1

-
-

There is a known discrepancy between the operational and axiomatic -specifications within the family of mixed-size RSW variants shown in -Table 49-Table 51. -To address this, we may choose to add something like the following new -PPO rule: Memory operation a precedes memory operation -b in preserved program order (and hence also in the global -memory order) if a precedes b in program -order, a and b both access regular main -memory (rather than I/O regions), a is a load, -b is a store, there is a load m between -a and b, there is a byte x -that both a and m read, there is no store -between a and m that writes to -x, and m precedes b in PPO. In -other words, in herd syntax, we may choose to add -(po-loc & rsw);ppo;[W] to PPO. Many implementations will already -enforce this ordering naturally. As such, even though this rule is not -official, we recommend that implementers enforce it nevertheless in -order to ensure forwards compatibility with the possible future addition -of this rule to RVWMO.

-
-
-
-
-
-
-

Appendix B: Formal Memory Model Specifications, Version 0.1

-
-
-

To facilitate formal analysis of RVWMO, this chapter presents a set of -formalizations using different tools and modeling approaches. Any -discrepancies are unintended; the expectation is that the models -describe exactly the same sets of legal behaviors.

-
-
-

This appendix should be treated as commentary; all normative material is -provided in Chapter 17 and in the rest of -the main body of the ISA specification. All currently known -discrepancies are listed in -Section A.7. Any other -discrepancies are unintentional.

-
-
-

B.1. Formal Axiomatic Specification in Alloy

-
-

We present a formal specification of the RVWMO memory model in Alloy -(alloy.mit.edu). This model is available online at -github.com/daniellustig/riscv-memory-model.

-
-
-

The online material also contains some litmus tests and some examples of -how Alloy can be used to model check some of the mappings in Section A.5.

-
-
-
Listing 10. The RVWMO memory model formalized in Alloy (1/5: PPO)
-
-
// =RVWMO PPO=
-
-// Preserved Program Order
-fun ppo : Event->Event {
-  // same-address ordering
-  po_loc :> Store
-  + rdw
-  + (AMO + StoreConditional) <: rfi
-
-  // explicit synchronization
-  + ppo_fence
-  + Acquire <: ^po :> MemoryEvent
-  + MemoryEvent <: ^po :> Release
-  + RCsc <: ^po :> RCsc
-  + pair
-
-  // syntactic dependencies
-  + addrdep
-  + datadep
-  + ctrldep :> Store
-
-  // pipeline dependencies
-  + (addrdep+datadep).rfi
-  + addrdep.^po :> Store
-}
-
-// the global memory order respects preserved program order
-fact { ppo in ^gmo }
-
-
-
-
The RVWMO memory model formalized in Alloy (2/5: Axioms)
-
-
// =RVWMO axioms=
-
-// Load Value Axiom
-fun candidates[r: MemoryEvent] : set MemoryEvent {
-  (r.~^gmo & Store & same_addr[r]) // writes preceding r in gmo
-  + (r.^~po & Store & same_addr[r]) // writes preceding r in po
-}
-
-fun latest_among[s: set Event] : Event { s - s.~^gmo }
-
-pred LoadValue {
-  all w: Store | all r: Load |
-    w->r in rf <=> w = latest_among[candidates[r]]
-}
-
-// Atomicity Axiom
-pred Atomicity {
-  all r: Store.~pair |            // starting from the lr,
-    no x: Store & same_addr[r] |  // there is no store x to the same addr
-      x not in same_hart[r]       // such that x is from a different hart,
-      and x in r.~rf.^gmo         // x follows (the store r reads from) in gmo,
-      and r.pair in x.^gmo        // and r follows x in gmo
-}
-
-// Progress Axiom implicit: Alloy only considers finite executions
-
-pred RISCV_mm { LoadValue and Atomicity /* and Progress */ }
-
-
-
-
Listing 11. The RVWMO memory model formalized in Alloy (3/5: model of memory)
-
-
//Basic model of memory
-
-sig Hart {  // hardware thread
-  start : one Event
-}
-sig Address {}
-abstract sig Event {
-  po: lone Event // program order
-}
-
-abstract sig MemoryEvent extends Event {
-  address: one Address,
-  acquireRCpc: lone MemoryEvent,
-  acquireRCsc: lone MemoryEvent,
-  releaseRCpc: lone MemoryEvent,
-  releaseRCsc: lone MemoryEvent,
-  addrdep: set MemoryEvent,
-  ctrldep: set Event,
-  datadep: set MemoryEvent,
-  gmo: set MemoryEvent,  // global memory order
-  rf: set MemoryEvent
-}
-sig LoadNormal extends MemoryEvent {} // l{b|h|w|d}
-sig LoadReserve extends MemoryEvent { // lr
-  pair: lone StoreConditional
-}
-sig StoreNormal extends MemoryEvent {}       // s{b|h|w|d}
-// all StoreConditionals in the model are assumed to be successful
-sig StoreConditional extends MemoryEvent {}  // sc
-sig AMO extends MemoryEvent {}               // amo
-sig NOP extends Event {}
-
-fun Load : Event { LoadNormal + LoadReserve + AMO }
-fun Store : Event { StoreNormal + StoreConditional + AMO }
-
-sig Fence extends Event {
-  pr: lone Fence, // opcode bit
-  pw: lone Fence, // opcode bit
-  sr: lone Fence, // opcode bit
-  sw: lone Fence  // opcode bit
-}
-sig FenceTSO extends Fence {}
-
-/* Alloy encoding detail: opcode bits are either set (encoded, e.g.,
- * as f.pr in iden) or unset (f.pr not in iden).  The bits cannot be used for
- * anything else */
-fact { pr + pw + sr + sw in iden }
-// likewise for ordering annotations
-fact { acquireRCpc + acquireRCsc + releaseRCpc + releaseRCsc in iden }
-// don't try to encode FenceTSO via pr/pw/sr/sw; just use it as-is
-fact { no FenceTSO.(pr + pw + sr + sw) }
-
-
-
-
Listing 12. The RVWMO memory model formalized in Alloy (4/5: Basic model rules)
-
-
// =Basic model rules=
-
-// Ordering annotation groups
-fun Acquire : MemoryEvent { MemoryEvent.acquireRCpc + MemoryEvent.acquireRCsc }
-fun Release : MemoryEvent { MemoryEvent.releaseRCpc + MemoryEvent.releaseRCsc }
-fun RCpc : MemoryEvent { MemoryEvent.acquireRCpc + MemoryEvent.releaseRCpc }
-fun RCsc : MemoryEvent { MemoryEvent.acquireRCsc + MemoryEvent.releaseRCsc }
-
-// There is no such thing as store-acquire or load-release, unless it's both
-fact { Load & Release in Acquire }
-fact { Store & Acquire in Release }
-
-// FENCE PPO
-fun FencePRSR : Fence { Fence.(pr & sr) }
-fun FencePRSW : Fence { Fence.(pr & sw) }
-fun FencePWSR : Fence { Fence.(pw & sr) }
-fun FencePWSW : Fence { Fence.(pw & sw) }
-
-fun ppo_fence : MemoryEvent->MemoryEvent {
-    (Load  <: ^po :> FencePRSR).(^po :> Load)
-  + (Load  <: ^po :> FencePRSW).(^po :> Store)
-  + (Store <: ^po :> FencePWSR).(^po :> Load)
-  + (Store <: ^po :> FencePWSW).(^po :> Store)
-  + (Load  <: ^po :> FenceTSO) .(^po :> MemoryEvent)
-  + (Store <: ^po :> FenceTSO) .(^po :> Store)
-}
-
-// auxiliary definitions
-fun po_loc : Event->Event { ^po & address.~address }
-fun same_hart[e: Event] : set Event { e + e.^~po + e.^po }
-fun same_addr[e: Event] : set Event { e.address.~address }
-
-// initial stores
-fun NonInit : set Event { Hart.start.*po }
-fun Init : set Event { Event - NonInit }
-fact { Init in StoreNormal }
-fact { Init->(MemoryEvent & NonInit) in ^gmo }
-fact { all e: NonInit | one e.*~po.~start }  // each event is in exactly one hart
-fact { all a: Address | one Init & a.~address } // one init store per address
-fact { no Init <: po and no po :> Init }
-
-
-
-
Listing 13. The RVWMO memory model formalized in Alloy (5/5: Auxiliaries)
-
-
// po
-fact { acyclic[po] }
-
-// gmo
-fact { total[^gmo, MemoryEvent] } // gmo is a total order over all MemoryEvents
-
-//rf
-fact { rf.~rf in iden } // each read returns the value of only one write
-fact { rf in Store <: address.~address :> Load }
-fun rfi : MemoryEvent->MemoryEvent { rf & (*po + *~po) }
-
-//dep
-fact { no StoreNormal <: (addrdep + ctrldep + datadep) }
-fact { addrdep + ctrldep + datadep + pair in ^po }
-fact { datadep in datadep :> Store }
-fact { ctrldep.*po in ctrldep }
-fact { no pair & (^po :> (LoadReserve + StoreConditional)).^po }
-fact { StoreConditional in LoadReserve.pair } // assume all SCs succeed
-
-// rdw
-fun rdw : Event->Event {
-  (Load <: po_loc :> Load)  // start with all same_address load-load pairs,
-  - (~rf.rf)                // subtract pairs that read from the same store,
-  - (po_loc.rfi)            // and subtract out "fri-rfi" patterns
-}
-
-// filter out redundant instances and/or visualizations
-fact { no gmo & gmo.gmo } // keep the visualization uncluttered
-fact { all a: Address | some a.~address }
-
-// =Optional: opcode encoding restrictions=
-
-// the list of blessed fences
-fact { Fence in
-  Fence.pr.sr
-  + Fence.pw.sw
-  + Fence.pr.pw.sw
-  + Fence.pr.sr.sw
-  + FenceTSO
-  + Fence.pr.pw.sr.sw
-}
-
-pred restrict_to_current_encodings {
-  no (LoadNormal + StoreNormal) & (Acquire + Release)
-}
-
-// =Alloy shortcuts=
-pred acyclic[rel: Event->Event] { no iden & ^rel }
-pred total[rel: Event->Event, bag: Event] {
-  all disj e, e': bag | e->e' in rel + ~rel
-  acyclic[rel]
-}
-
-
-
-
-

B.2. Formal Axiomatic Specification in Herd

-
-

The tool herd takes a memory model and a litmus test as -input and simulates the execution of the test on top of the memory -model. Memory models are written in the domain specific language Cat. -This section provides two Cat memory models of RVWMO. The first model, -Listing 15, follows the global memory order, -Chapter 18, definition of RVWMO, as much -as is possible for a Cat model. The second model, -Listing 16, is an equivalent, more efficient, -partial order based RVWMO model.

-
-
-

The simulator herd is part of the diy tool -suite — see diy.inria.fr for software and documentation. The -models and more are available online at diy.inria.fr/cats7/riscv/.

-
-
-
Listing 14. riscv-defs.cat, a herd definition of preserved program order (1/3)
-
-
(*************)
-(* Utilities *)
-(*************)
-
-(* All fence relations *)
-let fence.r.r = [R];fencerel(Fence.r.r);[R]
-let fence.r.w = [R];fencerel(Fence.r.w);[W]
-let fence.r.rw = [R];fencerel(Fence.r.rw);[M]
-let fence.w.r = [W];fencerel(Fence.w.r);[R]
-let fence.w.w = [W];fencerel(Fence.w.w);[W]
-let fence.w.rw = [W];fencerel(Fence.w.rw);[M]
-let fence.rw.r = [M];fencerel(Fence.rw.r);[R]
-let fence.rw.w = [M];fencerel(Fence.rw.w);[W]
-let fence.rw.rw = [M];fencerel(Fence.rw.rw);[M]
-let fence.tso =
-  let f = fencerel(Fence.tso) in
-  ([W];f;[W]) | ([R];f;[M])
-
-let fence =
-  fence.r.r | fence.r.w | fence.r.rw |
-  fence.w.r | fence.w.w | fence.w.rw |
-  fence.rw.r | fence.rw.w | fence.rw.rw |
-  fence.tso
-
-(* Same address, no W to the same address in-between *)
-let po-loc-no-w = po-loc \ (po-loc?;[W];po-loc)
-(* Read same write *)
-let rsw = rf^-1;rf
-(* Acquire, or stronger  *)
-let AQ = Acq|AcqRel
-(* Release or stronger *)
-and RL = Rel|AcqRel
-(* All RCsc *)
-let RCsc = Acq|Rel|AcqRel
-(* Amo events are both R and W, relation rmw relates paired lr/sc *)
-let AMO = R & W
-let StCond = range(rmw)
-
-(*************)
-(* ppo rules *)
-(*************)
-
-(* Overlapping-Address Orderings *)
-let r1 = [M];po-loc;[W]
-and r2 = ([R];po-loc-no-w;[R]) \ rsw
-and r3 = [AMO|StCond];rfi;[R]
-(* Explicit Synchronization *)
-and r4 = fence
-and r5 = [AQ];po;[M]
-and r6 = [M];po;[RL]
-and r7 = [RCsc];po;[RCsc]
-and r8 = rmw
-(* Syntactic Dependencies *)
-and r9 = [M];addr;[M]
-and r10 = [M];data;[W]
-and r11 = [M];ctrl;[W]
-(* Pipeline Dependencies *)
-and r12 = [R];(addr|data);[W];rfi;[R]
-and r13 = [R];addr;[M];po;[W]
-
-let ppo = r1 | r2 | r3 | r4 | r5 | r6 | r7 | r8 | r9 | r10 | r11 | r12 | r13
-
-
-
-
Listing 15. riscv.cat, a herd version of the RVWMO memory model (2/3)
-
-
Total
-
-(* Notice that herd has defined its own rf relation *)
-
-(* Define ppo *)
-include "riscv-defs.cat"
-
-(********************************)
-(* Generate global memory order *)
-(********************************)
-
-let gmo0 = (* precursor: ie build gmo as an total order that include gmo0 *)
-  loc & (W\FW) * FW | # Final write after any write to the same location
-  ppo |               # ppo compatible
-  rfe                 # includes herd external rf (optimization)
-
-(* Walk over all linear extensions of gmo0 *)
-with  gmo from linearizations(M\IW,gmo0)
-
-(* Add initial writes upfront -- convenient for computing rfGMO *)
-let gmo = gmo | loc & IW * (M\IW)
-
-(**********)
-(* Axioms *)
-(**********)
-
-(* Compute rf according to the load value axiom, aka rfGMO *)
-let WR = loc & ([W];(gmo|po);[R])
-let rfGMO = WR \ (loc&([W];gmo);WR)
-
-(* Check equality of herd rf and of rfGMO *)
-empty (rf\rfGMO)|(rfGMO\rf) as RfCons
-
-(* Atomicity axiom *)
-let infloc = (gmo & loc)^-1
-let inflocext = infloc & ext
-let winside  = (infloc;rmw;inflocext) & (infloc;rf;rmw;inflocext) & [W]
-empty winside as Atomic
-
-
-
-
Listing 16. riscv.cat, an alternative herd presentation of the RVWMO memory model (3/3)
-
-
Partial
-
-(***************)
-(* Definitions *)
-(***************)
-
-(* Define ppo *)
-include "riscv-defs.cat"
-
-(* Compute coherence relation *)
-include "cos-opt.cat"
-
-(**********)
-(* Axioms *)
-(**********)
-
-(* Sc per location *)
-acyclic co|rf|fr|po-loc as Coherence
-
-(* Main model axiom *)
-acyclic co|rfe|fr|ppo as Model
-
-(* Atomicity axiom *)
-empty rmw & (fre;coe) as Atomic
-
-
-
-
-

B.3. An Operational Memory Model

-
-

This is an alternative presentation of the RVWMO memory model in -operational style. It aims to admit exactly the same extensional -behavior as the axiomatic presentation: for any given program, admitting -an execution if and only if the axiomatic presentation allows it.

-
-
-

The axiomatic presentation is defined as a predicate on complete -candidate executions. In contrast, this operational presentation has an -abstract microarchitectural flavor: it is expressed as a state machine, -with states that are an abstract representation of hardware machine -states, and with explicit out-of-order and speculative execution (but -abstracting from more implementation-specific microarchitectural details -such as register renaming, store buffers, cache hierarchies, cache -protocols, etc.). As such, it can provide useful intuition. It can also -construct executions incrementally, making it possible to interactively -and randomly explore the behavior of larger examples, while the -axiomatic model requires complete candidate executions over which the -axioms can be checked.

-
-
-

The operational presentation covers mixed-size execution, with -potentially overlapping memory accesses of different power-of-two byte -sizes. Misaligned accesses are broken up into single-byte accesses.

-
-
-

The operational model, together with a fragment of the RISC-V ISA -semantics (RV64I and A), are integrated into the rmem exploration tool -(github.com/rems-project/rmem). rmem can explore litmus tests -(see Section A.2) and small ELF binaries -exhaustively, pseudorandomly and interactively. In rmem, the ISA -semantics is expressed explicitly in Sail (see -github.com/rems-project/sail for the Sail language, and -github.com/rems-project/sail-riscv for the RISC-V ISA model), -and the concurrency semantics is expressed in Lem (see -github.com/rems-project/lem for the Lem language).

-
-
-

rmem has a command-line interface and a web-interface. The -web-interface runs entirely on the client side, and is provided online -together with a library of litmus tests: -www.cl.cam.ac.uk/. The command-line interface is -faster than the web-interface, especially in exhaustive mode.

-
-
-

Below is an informal introduction of the model states and transitions. -The description of the formal model starts in the next subsection.

-
-
-

Terminology: In contrast to the axiomatic presentation, here every -memory operation is either a load or a store. Hence, AMOs give rise to -two distinct memory operations, a load and a store. When used in -conjunction with instruction, the terms load and store refer -to instructions that give rise to such memory operations. As such, both -include AMO instructions. The term acquire refers to an instruction -(or its memory operation) with the acquire-RCpc or acquire-RCsc -annotation. The term release refers to an instruction (or its memory -operation) with the release-RCpc or release-RCsc annotation.

-
-
-

Model states

-
-
-

Model states: A model state consists of a shared memory and a tuple of hart states.

-
-
-
-Diagram -
-
-
-

The shared memory state records all the memory store operations that -have propagated so far, in the order they propagated (this can be made -more efficient, but for simplicity of the presentation we keep it this -way).

-
-
-

Each hart state consists principally of a tree of instruction instances, -some of which have been finished, and some of which have not. -Non-finished instruction instances can be subject to restart, e.g. if -they depend on an out-of-order or speculative load that turns out to be -unsound.

-
-
-

Conditional branch and indirect jump instructions may have multiple -successors in the instruction tree. When such an instruction is finished, -any un-taken alternative paths are discarded.

-
-
-

Each instruction instance in the instruction tree has a state that -includes an execution state of the intra-instruction semantics (the ISA -pseudocode for this instruction). The model uses a formalization of the -intra-instruction semantics in Sail. One can think of the execution -state of an instruction as a representation of the pseudocode control -state, pseudocode call stack, and local variable values. An instruction -instance state also includes information about the instance’s memory and -register footprints, its register reads and writes, its memory -operations, whether it is finished, etc.

-
-
-

Model transitions

-
-
-

The model defines, for any model state, the set of allowed transitions, -each of which is a single atomic step to a new abstract machine state. -Execution of a single instruction will typically involve many -transitions, and they may be interleaved in operational-model execution -with transitions arising from other instructions. Each transition arises -from a single instruction instance; it will change the state of that -instance, and it may depend on or change the rest of its hart state and -the shared memory state, but it does not depend on other hart states, -and it will not change them. The transitions are introduced below and -defined in Section B.3.5, with a precondition and -a construction of the post-transition model state for each.

-
-
-

Transitions for all instructions:

-
-
-
    -
  • -

    Fetch instruction: This transition represents a fetch and decode of a new instruction instance, as a program order successor of a previously fetched -instruction instance (or the initial fetch address).

    -
  • -
-
-
-

The model assumes the instruction memory is fixed; it does not describe -the behavior of self-modifying code. In particular, the Fetch instruction transition does -not generate memory load operations, and the shared memory is not -involved in the transition. Instead, the model depends on an external -oracle that provides an opcode when given a memory location.

-
-
-
    -
  • -

    Register write: This is a write of a register value.

    -
  • -
  • -

    Register read: This is a read of a register value from the most recent -program-order-predecessor instruction instance that writes to that -register.

    -
  • -
  • -

    Pseudocode internal step: This covers pseudocode internal computation: arithmetic, function -calls, etc.

    -
  • -
  • -

    Finish instruction: At this point the instruction pseudocode is done, the instruction cannot be restarted, memory accesses cannot be discarded, and all memory -effects have taken place. For conditional branch and indirect jump -instructions, any program order successors that were fetched from an -address that is not the one that was written to the pc register are -discarded, together with the sub-tree of instruction instances below -them.

    -
  • -
-
-
-

Transitions specific to load instructions:

-
-
-
    -
  • -

    Initiate memory load operations: At this point the memory footprint of the load instruction is -provisionally known (it could change if earlier instructions are -restarted) and its individual memory load operations can start being -satisfied.

    -
  • -
-
-
- -
-
-
    -
  • -

    Complete load operations: At this point all the memory load operations of the instruction have -been entirely satisfied and the instruction pseudocode can continue -executing. A load instruction can be subject to being restarted until -the Finish instruction transition. But, under some conditions, the model might treat a load -instruction as non-restartable even before it is finished (e.g. see Propagate store operation).

    -
  • -
-
-
-

Transitions specific to store instructions:

-
-
- -
-
- -
-
-
    -
  • -

    Complete store operations: At this point all the memory store operations of the instruction -have been propagated to memory, and the instruction pseudocode can -continue executing.

    -
  • -
-
-
-

Transitions specific to sc instructions:

-
-
- -
-
-

Transitions specific to AMO instructions:

-
-
- -
-
-

Transitions specific to fence instructions:

-
-
- -
-
-

The transitions labeled $\circ$ can always be taken eagerly, -as soon as their precondition is satisfied, without excluding other -behavior; the $\bullet$ cannot. Although Fetch instruction is marked with a -$\bullet$, it can be taken eagerly as long as it is not -taken infinitely many times.

-
-
-

An instance of a non-AMO load instruction, after being fetched, will -typically experience the following transitions in this order:

-
- -
-

Before, between and after the transitions above, any number of -Pseudocode internal step transitions may appear. In addition, a Fetch instruction transition for fetching the -instruction in the next program location will be available until it is -taken.

-
-
-

This concludes the informal description of the operational model. The -following sections describe the formal operational model.

-
-
-

B.3.1. Intra-instruction Pseudocode Execution

-
-

The intra-instruction semantics for each instruction instance is -expressed as a state machine, essentially running the instruction -pseudocode. Given a pseudocode execution state, it computes the next -state. Most states identify a pending memory or register operation, -requested by the pseudocode, which the memory model has to do. The -states are (this is a tagged union; tags in small-caps):

-
- ---- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -

Load_mem(kind, address, size, load_continuation)

- memory load -operation

Early_sc_fail(res_continuation)

- allow sc to fail early

Store_ea(kind, address, size, next_state)

- memory store -effective address

Store_memv(mem_value, store_continuation)

- memory store value

Fence(kind, next_state)

- fence

Read_reg(reg_name, read_continuation)

- register read

Write_reg(reg_name, reg_value, next_state)

- register write

Internal(next_state)

- pseudocode internal step

Done

- end of pseudocode

-
-

Here:

-
-
-
    -
  • -

    mem_value and reg_value are lists of bytes;

    -
  • -
  • -

    address is an integer of XLEN bits;

    -
  • -
-
-
-

for load/store, kind identifies whether it is lr/sc, -acquire-RCpc/release-RCpc, acquire-RCsc/release-RCsc, -acquire-release-RCsc; -for fence, kind identifies whether it is a normal or TSO fence, and (for -normal fences) the predecessor and successor ordering bits; -reg_name identifies a register and a slice thereof (start and end bit -indices); and the continuations describe how the instruction instance will continue -for each value that might be provided by the surrounding memory model -(the load_continuation and read_continuation take the value loaded -from memory and read from the previous register write, the -store_continuation takes false for an sc that failed and true in -all other cases, and res_continuation takes false if the sc fails -and true otherwise).

-
-
- - - - - -
- - -
-

For example, given the load instruction lw x1,0(x2), an execution will -typically go as follows. The initial execution state will be computed -from the pseudocode for the given opcode. This can be expected to be -Read_reg(x2, read_continuation). Feeding the most recently written -value of register x2 (the instruction semantics will be blocked if -necessary until the register value is available), say 0x4000, to -read_continuation returns Load_mem(plain_load, 0x4000, 4, -load_continuation). Feeding the 4-byte value loaded from memory -location 0x4000, say 0x42, to load_continuation returns -Write_reg(x1, 0x42, Done). Many Internal(next_state) states may -appear before and between the states above.

-
-
-
-
-

Notice that writing to memory is split into two steps, Store_ea and -Store_memv: the first one makes the memory footprint of the store -provisionally known, and the second one adds the value to be stored. We -ensure these are paired in the pseudocode (Store_ea followed by -Store_memv), but there may be other steps between them.

-
-
- - - - - -
- - -
-

It is observable that the Store_ea can occur before the value to be -stored is determined. For example, for the litmus test -LB+fence.r.rw+data-po to be allowed by the operational model (as it is -by RVWMO), the first store in Hart 1 has to take the Store_ea step -before its value is determined, so that the second store can see it is -to a non-overlapping memory footprint, allowing the second store to be -committed out of order without violating coherence.

-
-
-
-
-

The pseudocode of each instruction performs at most one store or one -load, except for AMOs that perform exactly one load and one store. Those -memory accesses are then split apart into the architecturally atomic -units by the hart semantics (see Initiate memory load operations and Initiate memory store operation footprints below).

-
-
-

Informally, each bit of a register read should be satisfied from a -register write by the most recent (in program order) instruction -instance that can write that bit (or from the hart’s initial register -state if there is no such write). Hence, it is essential to know the -register write footprint of each instruction instance, which we -calculate when the instruction instance is created (see the action of the Fetch instruction transition -below). We ensure in the pseudocode that each instruction does at most -one register write to each register bit, and also that it does not try -to read a register value it just wrote.

-
-
-

Data-flow dependencies (address and data) in the model emerge from the -fact that each register read has to wait for the appropriate register -write to be executed (as described above).

-
-
-
-

B.3.2. Instruction Instance State

-
-

Each instruction instance $i$ has a state comprising:

-
-
-
    -
  • -

    program_loc, the memory address from which the instruction was -fetched;

    -
  • -
  • -

    instruction_kind, identifying whether this is a load, store, AMO, -fence, branch/jump or a simple instruction (this also includes a -kind similar to the one described for the pseudocode execution -states);

    -
  • -
  • -

    src_regs, the set of source reg_names (including system -registers), as statically determined from the pseudocode of the -instruction;

    -
  • -
  • -

    dst_regs, the destination reg_names (including system registers), -as statically determined from the pseudocode of the instruction;

    -
  • -
  • -

    pseudocode_state (or sometimes just state for short), one of (this -is a tagged union; tags in small-caps):

    -
  • -
-
- ---- - - - - - - - - - - - - - - - - -
Plain(isa_state)- ready to make a pseudocode transition

Pending_mem_loads(load_continuation)

- requesting memory load -operation(s)

Pending_mem_stores(store_continuation)

- requesting memory store -operation(s)

-
-
    -
  • -

    reg_reads, the register reads the instance has performed, including, -for each one, the register write slices it read from;

    -
  • -
  • -

    reg_writes, the register writes the instance has performed;

    -
  • -
  • -

    mem_loads, a set of memory load operations, and for each one the -as-yet-unsatisfied slices (the byte indices that have not been satisfied -yet), and, for the satisfied slices, the store slices (each consisting -of a memory store operation and subset of its byte indices) that -satisfied it.

    -
  • -
  • -

    mem_stores, a set of memory store operations, and for each one a -flag that indicates whether it has been propagated (passed to the shared -memory) or not.

    -
  • -
  • -

    information recording whether the instance is committed, finished, -etc.

    -
  • -
-
-
-

Each memory load operation includes a memory footprint (address and -size). Each memory store operation includes a memory footprint, and, -when available, a value.

-
-
-

A load instruction instance with a non-empty mem_loads, for which all -the load operations are satisfied (i.e. there are no unsatisfied load -slices) is said to be entirely satisfied.

-
-
-

Informally, an instruction instance is said to have fully determined -data if the load (and sc) instructions feeding its source registers -are finished. Similarly, it is said to have a fully determined memory -footprint if the load (and sc) instructions feeding its memory -operation address register are finished. Formally, we first define the -notion of fully determined register write: a register write -$w$ from reg_writes of instruction instance -$i$ is said to be fully determined if one of the following -conditions holds:

-
-
-
    -
  1. -

    $i$ is finished; or

    -
  2. -
  3. -

    the value written by $w$ is not affected by a memory -operation that $i$ has made (i.e. a value loaded from memory -or the result of sc), and, for every register read that -$i$ has made, that affects $w$, the register -write from which $i$ read is fully determined (or -$i$ read from the initial register state).

    -
  4. -
-
-
-

Now, an instruction instance $i$ is said to have fully -determined data if for every register read $r$ from -reg_reads, the register writes that $r$ reads from are -fully determined. An instruction instance $i$ is said to -have a fully determined memory footprint if for every register read -$r$ from reg_reads that feeds into $i$’s -memory operation address, the register writes that $r$ reads -from are fully determined.

-
-
- - - - - -
- - -
-

The rmem tool records, for every register write, the set of register -writes from other instructions that have been read by this instruction -at the point of performing the write. By carefully arranging the -pseudocode of the instructions covered by the tool we were able to make -it so that this is exactly the set of register writes on which the write -depends.

-
-
-
-
-
-

B.3.3. Hart State

-
-

The model state of a single hart comprises:

-
-
-
    -
  • -

    hart_id, a unique identifier of the hart;

    -
  • -
  • -

    initial_register_state, the initial register value for each -register;

    -
  • -
  • -

    initial_fetch_address, the initial instruction fetch address;

    -
  • -
  • -

    instruction_tree, a tree of the instruction instances that have been -fetched (and not discarded), in program order.

    -
  • -
-
-
-
-

B.3.4. Shared Memory State

-
-

The model state of the shared memory comprises a list of memory store -operations, in the order they propagated to the shared memory.

-
-
-

When a store operation is propagated to the shared memory it is simply -added to the end of the list. When a load operation is satisfied from -memory, for each byte of the load operation, the most recent -corresponding store slice is returned.

-
-
- - - - - -
- - -
-

For most purposes, it is simpler to think of the shared memory as an -array, i.e., a map from memory locations to memory store operation -slices, where each memory location is mapped to a one-byte slice of the -most recent memory store operation to that location. However, this -abstraction is not detailed enough to properly handle the sc -instruction. The RVWMO allows store operations from the same hart as the -sc to intervene between the store operation of the sc and the store -operations the paired lr read from. To allow such store operations to -intervene, and forbid others, the array abstraction must be extended to -record more information. Here, we use a list as it is very simple, but a -more efficient and scalable implementation should probably use -something better.

-
-
-
-
-
-

B.3.5. Transitions

-
-

Each of the paragraphs below describes a single kind of system -transition. The description starts with a condition over the current -system state. The transition can be taken in the current state only if -the condition is satisfied. The condition is followed by an action that -is applied to that state when the transition is taken, in order to -generate the new system state.

-
-
-
Fetch instruction
-
-

A possible program-order-successor of instruction instance -$i$ can be fetched from address loc if:

-
-
-
    -
  1. -

    it has not already been fetched, i.e., none of the immediate -successors of $i$ in the hart’s instruction_tree are from -loc; and

    -
  2. -
  3. -

    if $i$’s pseudocode has already written an address to -pc, then loc must be that address, otherwise loc is:

    -
    -
      -
    • -

      for a conditional branch, the successor address or the branch target -address;

      -
    • -
    • -

      for a (direct) jump and link instruction (jal), the target address;

      -
    • -
    • -

      for an indirect jump instruction (jalr), any address; and

      -
    • -
    • -

      for any other instruction, $i.\textit{program\_loc} + 4$.

      -
    • -
    -
    -
  4. -
-
-
-

Action: construct a freshly initialized instruction instance -$i'$ for the instruction in the program memory at loc, -with state Plain(isa_state), computed from the instruction pseudocode, -including the static information available from the pseudocode such as -its instruction_kind, src_regs, and dst_regs, and add -$i'$ to the hart’s instruction_tree as a successor of -$i$.

-
-
-

The possible next fetch addresses (loc) are available immediately -after fetching $i$ and the model does not need to wait for -the pseudocode to write to pc; this allows out-of-order execution, and -speculation past conditional branches and jumps. For most instructions -these addresses are easily obtained from the instruction pseudocode. The -only exception to that is the indirect jump instruction (jalr), where -the address depends on the value held in a register. In principle the -mathematical model should allow speculation to arbitrary addresses here. -The exhaustive search in the rmem tool handles this by running the -exhaustive search multiple times with a growing set of possible next -fetch addresses for each indirect jump. The initial search uses empty -sets, hence there is no fetch after an indirect jump instruction until the -pseudocode of the instruction writes to pc, and then we use that value -for fetching the next instruction. Before starting the next iteration of -exhaustive search, we collect for each indirect jump (grouped by code -location) the set of values it wrote to pc in all the executions in -the previous search iteration, and use that as possible next fetch -addresses of the instruction. This process terminates when no new fetch -addresses are detected.

-
-
-
-
Initiate memory load operations
-
-

An instruction instance $i$ in state Plain(Load_mem(kind, -address, size, load_continuation)) can always initiate the -corresponding memory load operations. Action:

-
-
-
    -
  1. -

    Construct the appropriate memory load operations $mlos$:

    -
    -
      -
    • -

      if address is aligned to size then $mlos$ is a single -memory load operation of size bytes from address;

      -
    • -
    • -

      otherwise, $mlos$ is a set of size memory load -operations, each of one byte, from the addresses -$address \ldots address + size - 1$.

      -
    • -
    -
    -
  2. -
  3. -

    set mem_loads of $i$ to $mlos$; and

    -
  4. -
  5. -

    update the state of $i$ to -Pending_mem_loads(load_continuation).

    -
  6. -
-
-
-
-
-

In Section 18.1.1 it is said that -misaligned memory accesses may be decomposed at any granularity. Here we -decompose them to one-byte accesses as this granularity subsumes all -others.

-
-
-
-
-
-
Satisfy memory load operation by forwarding from unpropagated stores
-
-

For a non-AMO load instruction instance $i$ in state -Pending_mem_loads(load_continuation), and a memory load operation -$mlo$ in $i.\textit{mem\_loads}$ that has -unsatisfied slices, the memory load operation can be partially or -entirely satisfied by forwarding from unpropagated memory store -operations by store instruction instances that are program-order-before -$i$ if:

-
-
-
    -
  1. -

    all program-order-previous fence instructions with .sr and .pw -set are finished;

    -
  2. -
  3. -

    for every program-order-previous fence instruction, $f$, -with .sr and .pr set, and .pw not set, if $f$ is not -finished then all load instructions that are program-order-before -$f$ are entirely satisfied;

    -
  4. -
  5. -

    for every program-order-previous fence.tso instruction, -$f$, that is not finished, all load instructions that are -program-order-before $f$ are entirely satisfied;

    -
  6. -
  7. -

    if $i$ is a load-acquire-RCsc, all program-order-previous -store-releases-RCsc are finished;

    -
  8. -
  9. -

    if $i$ is a load-acquire-release, all -program-order-previous instructions are finished;

    -
  10. -
  11. -

    all non-finished program-order-previous load-acquire instructions are -entirely satisfied; and

    -
  12. -
  13. -

    all program-order-previous store-acquire-release instructions are -finished;

    -
  14. -
-
-
-

Let stem 082668fc86f03352833a0820f08bf3cc be the set of all unpropagated memory store -operation slices from non-sc store instruction instances that are -program-order-before stem 6ac91b4e7dd35551c6ea477deba5f82d and have already calculated the -value to be stored, that overlap with the unsatisfied slices of -stem f1903748b6bb79515f22670686a6d90c, and which are not superseded by intervening store -operations or store operations that are read from by an intervening -load. The last condition requires, for each memory store operation slice -stem f85095907f5d98a9545a2a897907c982 in stem 082668fc86f03352833a0820f08bf3cc from instruction -stem fd12704256eca98f1c71f4fa1dde9dd8:

-
-
-
    -
  • -

    that there is no store instruction program-order-between stem 6ac91b4e7dd35551c6ea477deba5f82d -and stem fd12704256eca98f1c71f4fa1dde9dd8 with a memory store operation overlapping -stem f85095907f5d98a9545a2a897907c982; and

    -
  • -
  • -

    that there is no load instruction program-order-between stem 6ac91b4e7dd35551c6ea477deba5f82d -and stem fd12704256eca98f1c71f4fa1dde9dd8 that was satisfied from an overlapping memory store -operation slice from a different hart.

    -
  • -
-
-
-

Action:

-
-
-
    -
  1. -

    update stem a6b8a0ea6d0d3d30d7c26a73378ca550 to indicate that -stem f1903748b6bb79515f22670686a6d90c was satisfied by stem 082668fc86f03352833a0820f08bf3cc; and

    -
  2. -
  3. -

    restart any speculative instructions which have violated coherence as -a result of this, i.e., for every non-finished instruction -stem fd12704256eca98f1c71f4fa1dde9dd8 that is a program-order-successor of stem 6ac91b4e7dd35551c6ea477deba5f82d, -and every memory load operation stem 8571a8a5d097d63350efc6d8da824055 of stem fd12704256eca98f1c71f4fa1dde9dd8 -that was satisfied from stem 1344dde66edf219ff9e7873942a34a33, if there exists a memory -store operation slice stem 80b56f226b89976549d536e6d2967ed7 in stem 1344dde66edf219ff9e7873942a34a33, and -an overlapping memory store operation slice from a different memory -store operation in stem 082668fc86f03352833a0820f08bf3cc, and stem 80b56f226b89976549d536e6d2967ed7 is not -from an instruction that is a program-order-successor of -stem 6ac91b4e7dd35551c6ea477deba5f82d, restart stem fd12704256eca98f1c71f4fa1dde9dd8 and its restart-dependents.

    -
  4. -
-
-
-

Where, the restart-dependents of instruction stem 7d6abfd7a8903d8827b35524704fd563 are:

-
-
-
    -
  • -

    program-order-successors of stem 7d6abfd7a8903d8827b35524704fd563 that have data-flow -dependency on a register write of stem 7d6abfd7a8903d8827b35524704fd563;

    -
  • -
  • -

    program-order-successors of stem 7d6abfd7a8903d8827b35524704fd563 that have a memory load -operation that reads from a memory store operation of stem 7d6abfd7a8903d8827b35524704fd563 -(by forwarding);

    -
  • -
  • -

    if stem 7d6abfd7a8903d8827b35524704fd563 is a load-acquire, all the program-order-successors -of stem 7d6abfd7a8903d8827b35524704fd563;

    -
  • -
  • -

    if stem 7d6abfd7a8903d8827b35524704fd563 is a load, for every fence, stem f9ab899994f3d644a9c2ab98a38de0c6, with -.sr and .pr set, and .pw not set, that is a -program-order-successor of stem 7d6abfd7a8903d8827b35524704fd563, all the load instructions -that are program-order-successors of stem f9ab899994f3d644a9c2ab98a38de0c6;

    -
  • -
  • -

    if stem 7d6abfd7a8903d8827b35524704fd563 is a load, for every fence.tso, stem f9ab899994f3d644a9c2ab98a38de0c6, -that is a program-order-successor of stem 7d6abfd7a8903d8827b35524704fd563, all the load -instructions that are program-order-successors of stem f9ab899994f3d644a9c2ab98a38de0c6; and

    -
  • -
  • -

    (recursively) all the restart-dependents of all the instruction -instances above.

    -
  • -
-
-
-
-
-

Forwarding memory store operations to a memory load might satisfy only -some slices of the load, leaving other slices unsatisfied.

-
-
-

A program-order-previous store operation that was not available when -taking the transition above might make stem 082668fc86f03352833a0820f08bf3cc provisionally -unsound (violating coherence) when it becomes available. That store will -prevent the load from being finished (see Finish instruction), and will cause it to -restart when that store operation is propagated (see Propagate store operation).

-
-
-

A consequence of the transition condition above is that -store-release-RCsc memory store operations cannot be forwarded to -load-acquire-RCsc instructions: stem 082668fc86f03352833a0820f08bf3cc does not include -memory store operations from finished stores (as those must be -propagated memory store operations), and the condition above requires -all program-order-previous store-releases-RCsc to be finished when the -load is acquire-RCsc.

-
-
-
-
-
-
Satisfy memory load operation from memory
-
-

For an instruction instance stem 6ac91b4e7dd35551c6ea477deba5f82d of a non-AMO load -instruction or an AMO instruction in the context of the Satisfy, commit and propagate operations of an AMO transition, -any memory load operation stem f1903748b6bb79515f22670686a6d90c in -stem a6b8a0ea6d0d3d30d7c26a73378ca550 that has unsatisfied slices, can be -satisfied from memory if all the conditions of Satisfy memory load operation by forwarding from unpropagated stores are satisfied. Action: -let stem 082668fc86f03352833a0820f08bf3cc be the memory store operation slices from memory -covering the unsatisfied slices of stem f1903748b6bb79515f22670686a6d90c, and apply the -action of Satisfy memory load operation by forwarding from unpropagated stores.

-
-
- - - - - -
- - -
-

Note that Satisfy memory load operation by forwarding from unpropagated stores might leave some slices of the memory load operation -unsatisfied; those will have to be satisfied by taking the transition -again, or taking Satisfy memory load operation from memory. Satisfy memory load operation from memory, on the other hand, will always satisfy all the -unsatisfied slices of the memory load operation.

-
-
-
-
-
-
Complete load operations
-
-

A load instruction instance stem 6ac91b4e7dd35551c6ea477deba5f82d in state -Pending_mem_loads(load_continuation) can be completed (not to be -confused with finished) if all the memory load operations -stem a6b8a0ea6d0d3d30d7c26a73378ca550 are entirely satisfied (i.e. there -are no unsatisfied slices). Action: update the state of stem 6ac91b4e7dd35551c6ea477deba5f82d -to Plain(load_continuation(mem_value)), where mem_value is assembled -from all the memory store operation slices that satisfied -stem a6b8a0ea6d0d3d30d7c26a73378ca550.

-
-
-
-
Early sc fail
-
-

An sc instruction instance stem 6ac91b4e7dd35551c6ea477deba5f82d in state -Plain(Early_sc_fail(res_continuation)) can always be made to fail. -Action: update the state of stem 6ac91b4e7dd35551c6ea477deba5f82d to -Plain(res_continuation(false)).

-
-
-
-
Paired sc
-
-

An sc instruction instance stem 6ac91b4e7dd35551c6ea477deba5f82d in state -Plain(Early_sc_fail(res_continuation)) can continue its (potentially -successful) execution if stem 6ac91b4e7dd35551c6ea477deba5f82d is paired with an lr. Action: -update the state of stem 6ac91b4e7dd35551c6ea477deba5f82d to Plain(res_continuation(true)).

-
-
-
-
Initiate memory store operation footprints
-
-

An instruction instance stem 6ac91b4e7dd35551c6ea477deba5f82d in state Plain(Store_ea(kind, -address, size, next_state)) can always announce its pending memory -store operation footprint. Action:

-
-
-
    -
  1. -

    construct the appropriate memory store operations stem f85095907f5d98a9545a2a897907c982 -(without the store value):

    -
    -
      -
    • -

      if address is aligned to size then stem f85095907f5d98a9545a2a897907c982 is a single -memory store operation of size bytes to address;

      -
    • -
    • -

      otherwise, stem f85095907f5d98a9545a2a897907c982 is a set of size memory store -operations, each of one-byte size, to the addresses -stem f82c08e4a3f84915b619aa6472e0f810.

      -
    • -
    -
    -
  2. -
  3. -

    set stem a9642b9ca23ba78275edb15d213c9b95 to stem f85095907f5d98a9545a2a897907c982; and

    -
  4. -
  5. -

    update the state of stem 6ac91b4e7dd35551c6ea477deba5f82d to Plain(next_state).

    -
  6. -
-
-
-
-
-

Note that after taking the transition above the memory store operations -do not yet have their values. The importance of splitting this -transition from the transition below is that it allows other -program-order-successor store instructions to observe the memory -footprint of this instruction, and if they don’t overlap, propagate out -of order as early as possible (i.e. before the data register value -becomes available).

-
-
-
-
-
-
Instantiate memory store operation values
-
-

An instruction instance stem 6ac91b4e7dd35551c6ea477deba5f82d in state -Plain(Store_memv(mem_value, store_continuation)) can always -instantiate the values of the memory store operations -stem a9642b9ca23ba78275edb15d213c9b95. Action:

-
-
-
    -
  1. -

    split mem_value between the memory store operations -stem a9642b9ca23ba78275edb15d213c9b95; and

    -
  2. -
  3. -

    update the state of stem 6ac91b4e7dd35551c6ea477deba5f82d to -Pending_mem_stores(store_continuation).

    -
  4. -
-
-
-
-
Commit store instruction
-
-

An uncommitted instruction instance stem 6ac91b4e7dd35551c6ea477deba5f82d of a non-sc store -instruction or an sc instruction in the context of the Commit and propagate store operation of an sc -transition, in state Pending_mem_stores(store_continuation), can be -committed (not to be confused with propagated) if:

-
-
-
    -
  1. -

    stem 6ac91b4e7dd35551c6ea477deba5f82d has fully determined data;

    -
  2. -
  3. -

    all program-order-previous conditional branch and indirect jump -instructions are finished;

    -
  4. -
  5. -

    all program-order-previous fence instructions with .sw set are -finished;

    -
  6. -
  7. -

    all program-order-previous fence.tso instructions are finished;

    -
  8. -
  9. -

    all program-order-previous load-acquire instructions are finished;

    -
  10. -
  11. -

    all program-order-previous store-acquire-release instructions are -finished;

    -
  12. -
  13. -

    if stem 6ac91b4e7dd35551c6ea477deba5f82d is a store-release, all program-order-previous -instructions are finished;

    -
  14. -
  15. -

    all program-order-previous memory access instructions have a fully -determined memory footprint;

    -
  16. -
  17. -

    all program-order-previous store instructions, except for sc that failed, -have initiated and so have non-empty mem_stores; and

    -
  18. -
  19. -

    all program-order-previous load instructions have initiated and so have -non-empty mem_loads.

    -
  20. -
-
-
-

Action: record that i is committed.

-
-
- - - - - -
- - -
-

Notice that if condition -8 is satisfied -the conditions -9 and -10 are also -satisfied, or will be satisfied after taking some eager transitions. -Hence, requiring them does not strengthen the model. By requiring them, -we guarantee that previous memory access instructions have taken enough -transitions to make their memory operations visible for the condition -check of Propagate store operation, which is the next transition the instruction will take, -making that condition simpler.

-
-
-
-
-
-
Propagate store operation
-
-

For a committed instruction instance stem 6ac91b4e7dd35551c6ea477deba5f82d in state -Pending_mem_stores(store_continuation), and an unpropagated memory -store operation stem b406cb52bbdcb9c7799e21d181e685c1 in -stem a9642b9ca23ba78275edb15d213c9b95, stem b406cb52bbdcb9c7799e21d181e685c1 can be -propagated if:

-
-
-
    -
  1. -

    all memory store operations of program-order-previous store -instructions that overlap with stem b406cb52bbdcb9c7799e21d181e685c1 have already -propagated;

    -
  2. -
  3. -

    all memory load operations of program-order-previous load instructions -that overlap with stem b406cb52bbdcb9c7799e21d181e685c1 have already been satisfied, and -(the load instructions) are non-restartable (see definition below); -and

    -
  4. -
  5. -

    all memory load operations that were satisfied by forwarding -stem b406cb52bbdcb9c7799e21d181e685c1 are entirely satisfied.

    -
  6. -
-
-
-

Where a non-finished instruction instance stem 7d6abfd7a8903d8827b35524704fd563 is -non-restartable if:

-
-
-
    -
  1. -

    there does not exist a store instruction stem 0d6d0a3f5bcc5a27d3ba5cf91524b5a4 and an -unpropagated memory store operation stem b406cb52bbdcb9c7799e21d181e685c1 of stem 0d6d0a3f5bcc5a27d3ba5cf91524b5a4 -such that applying the action of the Propagate store operation transition to -stem b406cb52bbdcb9c7799e21d181e685c1 will result in the restart of stem 7d6abfd7a8903d8827b35524704fd563; and

    -
  2. -
  3. -

    there does not exist a non-finished load instruction stem 65b1894f130a02fb32beeead9ca75d74 -and a memory load operation stem f1903748b6bb79515f22670686a6d90c of stem 65b1894f130a02fb32beeead9ca75d74 such -that applying the action of the Satisfy memory load operation by forwarding from unpropagated stores/Satisfy memory load operation from memory transition (even if -stem f1903748b6bb79515f22670686a6d90c is already satisfied) to stem f1903748b6bb79515f22670686a6d90c will result -in the restart of stem 7d6abfd7a8903d8827b35524704fd563.

    -
  4. -
-
-
-

Action:

-
-
-
    -
  1. -

    update the shared memory state with stem b406cb52bbdcb9c7799e21d181e685c1;

    -
  2. -
  3. -

    update stem a9642b9ca23ba78275edb15d213c9b95 to indicate that -stem b406cb52bbdcb9c7799e21d181e685c1 was propagated; and

    -
  4. -
  5. -

    restart any speculative instructions which have violated coherence as -a result of this, i.e., for every non-finished instruction -stem fd12704256eca98f1c71f4fa1dde9dd8 program-order-after stem 6ac91b4e7dd35551c6ea477deba5f82d and every memory -load operation stem 8571a8a5d097d63350efc6d8da824055 of stem fd12704256eca98f1c71f4fa1dde9dd8 that was satisfied -from stem 1344dde66edf219ff9e7873942a34a33, if there exists a memory store operation -slice stem 80b56f226b89976549d536e6d2967ed7 in stem 1344dde66edf219ff9e7873942a34a33 that overlaps with -stem b406cb52bbdcb9c7799e21d181e685c1 and is not from stem b406cb52bbdcb9c7799e21d181e685c1, and -stem 80b56f226b89976549d536e6d2967ed7 is not from a program-order-successor of -stem 6ac91b4e7dd35551c6ea477deba5f82d, restart stem fd12704256eca98f1c71f4fa1dde9dd8 and its restart-dependents -(see Satisfy memory load operation by forwarding from unpropagated stores).

    -
  6. -
-
-
-
-
Commit and propagate store operation of an sc
-
-

An uncommitted sc instruction instance stem 6ac91b4e7dd35551c6ea477deba5f82d, from hart -stem af98b3d273f2fa50eec5140dd48d1eae, in state Pending_mem_stores(store_continuation), with -a paired lr stem fd12704256eca98f1c71f4fa1dde9dd8 that has been satisfied by some store -slices stem 082668fc86f03352833a0820f08bf3cc, can be committed and propagated at the same -time if:

-
-
-
    -
  1. -

    stem fd12704256eca98f1c71f4fa1dde9dd8 is finished;

    -
  2. -
  3. -

    every memory store operation that has been forwarded to -stem fd12704256eca98f1c71f4fa1dde9dd8 is propagated;

    -
  4. -
  5. -

    the conditions of Commit store instruction are satisfied;

    -
  6. -
  7. -

    the conditions of Propagate store operation are satisfied (notice that an sc instruction can -only have one memory store operation); and

    -
  8. -
  9. -

    for every store slice stem f85095907f5d98a9545a2a897907c982 from stem 082668fc86f03352833a0820f08bf3cc, -stem f85095907f5d98a9545a2a897907c982 has not been overwritten, in the shared memory, by a -store that is from a hart that is not stem af98b3d273f2fa50eec5140dd48d1eae, at any point -since stem f85095907f5d98a9545a2a897907c982 was propagated to memory.

    -
  10. -
-
-
-

Action:

-
-
-
    -
  1. -

    apply the actions of Commit store instruction; and

    -
  2. -
  3. -

    apply the action of Propagate store operation.

    -
  4. -
-
-
-
-
Late sc fail
-
-

An sc instruction instance stem 6ac91b4e7dd35551c6ea477deba5f82d in state -Pending_mem_stores(store_continuation), that has not propagated its -memory store operation, can always be made to fail. Action:

-
-
-
    -
  1. -

    clear stem a9642b9ca23ba78275edb15d213c9b95; and

    -
  2. -
  3. -

    update the state of stem 6ac91b4e7dd35551c6ea477deba5f82d to -Plain(store_continuation(false)).

    -
  4. -
-
-
-
-
-

For efficiency, the rmem tool allows this transition only when it is -not possible to take the Commit and propagate store operation of an sc transition. This does not affect the set of -allowed final states, but when explored interactively, if the sc -should fail one should use the Early sc fail transition instead of waiting for this transition.

-
-
-
-
-
-
Complete store operations
-
-

A store instruction instance stem 6ac91b4e7dd35551c6ea477deba5f82d in state -Pending_mem_stores(store_continuation), for which all the memory store -operations in stem a9642b9ca23ba78275edb15d213c9b95 have been propagated, -can always be completed (not to be confused with finished). Action: -update the state of stem 6ac91b4e7dd35551c6ea477deba5f82d to -Plain(store_continuation(true)).

-
-
-
-
Satisfy, commit and propagate operations of an AMO
-
-

An AMO instruction instance stem 6ac91b4e7dd35551c6ea477deba5f82d in state -Pending_mem_loads(load_continuation) can perform its memory access if -it is possible to perform the following sequence of transitions with no -intervening transitions:

-
- -
-

and in addition, the condition of Finish instruction, with the exception of not requiring -stem 6ac91b4e7dd35551c6ea477deba5f82d to be in state Plain(Done), holds after those -transitions. Action: perform the above sequence of transitions (this -does not include Finish instruction), one after the other, with no intervening -transitions.

-
-
- - - - - -
- - -
-

Notice that program-order-previous stores cannot be forwarded to the -load of an AMO. This is simply because the sequence of transitions above -does not include the forwarding transition. But even if it did include -it, the sequence will fail when trying to do the Propagate store operation transition, as this -transition requires all program-order-previous store operations to -overlapping memory footprints to be propagated, and forwarding requires -the store operation to be unpropagated.

-
-
-

In addition, the store of an AMO cannot be forwarded to a -program-order-successor load. Before taking the transition above, the -store operation of the AMO does not have its value and therefore cannot -be forwarded; after taking the transition above the store operation is -propagated and therefore cannot be forwarded.

-
-
-
-
-
-
Commit fence
-
-

A fence instruction instance stem 6ac91b4e7dd35551c6ea477deba5f82d in state -Plain(Fence(kind, next_state)) can be committed if:

-
-
-
    -
  1. -

    if stem 6ac91b4e7dd35551c6ea477deba5f82d is a normal fence and it has .pr set, all -program-order-previous load instructions are finished;

    -
  2. -
  3. -

    if stem 6ac91b4e7dd35551c6ea477deba5f82d is a normal fence and it has .pw set, all -program-order-previous store instructions are finished; and

    -
  4. -
  5. -

    if stem 6ac91b4e7dd35551c6ea477deba5f82d is a fence.tso, all program-order-previous load -and store instructions are finished.

    -
  6. -
-
-
-

Action:

-
-
-
    -
  1. -

    record that stem 6ac91b4e7dd35551c6ea477deba5f82d is committed; and

    -
  2. -
  3. -

    update the state of stem 6ac91b4e7dd35551c6ea477deba5f82d to Plain(next_state).

    -
  4. -
-
-
-
-
Register read
-
-

An instruction instance stem 6ac91b4e7dd35551c6ea477deba5f82d in state -Plain(Read_reg(reg_name, read_cont)) can do a register read of -reg_name if every instruction instance that it needs to read from has -already performed the expected reg_name register write.

-
-
-

Let read_sources include, for each bit of reg_name, the write to -that bit by the most recent (in program order) instruction instance that -can write to that bit, if any. If there is no such instruction, the -source is the initial register value from initial_register_state. Let -reg_value be the value assembled from read_sources. Action:

-
-
-
    -
  1. -

    add reg_name to stem e4c7338ef90c20ab9f1e25877c063865 with -read_sources and reg_value; and

    -
  2. -
  3. -

    update the state of stem 6ac91b4e7dd35551c6ea477deba5f82d to Plain(read_cont(reg_value)).

    -
  4. -
-
-
-
-
Register write
-
-

An instruction instance stem 6ac91b4e7dd35551c6ea477deba5f82d in state -Plain(Write_reg(reg_name, reg_value, next_state)) can always do a -reg_name register write. Action:

-
-
-
    -
  1. -

    add reg_name to stem 106f9ebc134bdfd3291c4c84455ac2db with -stem 6dc98c92215c61f2018efb3df23af251 and reg_value; and

    -
  2. -
  3. -

    update the state of stem 6ac91b4e7dd35551c6ea477deba5f82d to Plain(next_state).

    -
  4. -
-
-
-

where stem 6dc98c92215c61f2018efb3df23af251 is a pair of the set of all read_sources from -stem e4c7338ef90c20ab9f1e25877c063865, and a flag that is true iff -stem 6ac91b4e7dd35551c6ea477deba5f82d is a load instruction instance that has already been -entirely satisfied.

-
-
-
-
Pseudocode internal step
-
-

An instruction instance stem 6ac91b4e7dd35551c6ea477deba5f82d in state -Plain(Internal(next_state)) can always do that pseudocode-internal -step. Action: update the state of stem 6ac91b4e7dd35551c6ea477deba5f82d to -Plain(next_state).

-
-
-
-
Finish instruction
-
-

A non-finished instruction instance stem 6ac91b4e7dd35551c6ea477deba5f82d in state Plain(Done) -can be finished if:

-
-
-
    -
  1. -

    if stem 6ac91b4e7dd35551c6ea477deba5f82d is a load instruction:

    -
    -
      -
    1. -

      all program-order-previous load-acquire instructions are finished;

      -
    2. -
    3. -

      all program-order-previous fence instructions with .sr set are -finished;

      -
    4. -
    5. -

      for every program-order-previous fence.tso instruction, -stem f9ab899994f3d644a9c2ab98a38de0c6, that is not finished, all load instructions that are -program-order-before stem f9ab899994f3d644a9c2ab98a38de0c6 are finished; and

      -
    6. -
    7. -

      it is guaranteed that the values read by the memory load operations -of stem 6ac91b4e7dd35551c6ea477deba5f82d will not cause coherence violations, i.e., for any -program-order-previous instruction instance stem fd12704256eca98f1c71f4fa1dde9dd8, let -stem 6863bf3b75317b35ee75c32d6a757974 be the combined footprint of propagated -memory store operations from store instructions program-order-between -stem 6ac91b4e7dd35551c6ea477deba5f82d and stem fd12704256eca98f1c71f4fa1dde9dd8, and fixed memory store -operations that were forwarded to stem 6ac91b4e7dd35551c6ea477deba5f82d from store -instructions program-order-between stem 6ac91b4e7dd35551c6ea477deba5f82d and stem fd12704256eca98f1c71f4fa1dde9dd8 -including stem fd12704256eca98f1c71f4fa1dde9dd8, and let -stem f07d30fa3303b30b878e98038c45b996 be the complement of -stem 6863bf3b75317b35ee75c32d6a757974 in the memory footprint of stem 6ac91b4e7dd35551c6ea477deba5f82d. -If stem f07d30fa3303b30b878e98038c45b996 is not empty:

      -
      -
        -
      1. -

        stem fd12704256eca98f1c71f4fa1dde9dd8 has a fully determined memory footprint;

        -
      2. -
      3. -

        stem fd12704256eca98f1c71f4fa1dde9dd8 has no unpropagated memory store operations that -overlap with stem f07d30fa3303b30b878e98038c45b996; and

        -
      4. -
      5. -

        if stem fd12704256eca98f1c71f4fa1dde9dd8 is a load with a memory footprint that overlaps -with stem f07d30fa3303b30b878e98038c45b996, then all the memory load -operations of stem fd12704256eca98f1c71f4fa1dde9dd8 that overlap with -stem f07d30fa3303b30b878e98038c45b996 are satisfied and stem fd12704256eca98f1c71f4fa1dde9dd8 -is non-restartable (see the Propagate store operation transition for how to determine if an -instruction is non-restartable).

        -
        -

        Here, a memory store operation is called fixed if the store instruction -has fully determined data.

        -
        -
      6. -
      -
      -
    8. -
    -
    -
  2. -
  3. -

    stem 6ac91b4e7dd35551c6ea477deba5f82d has fully determined data; and

    -
  4. -
  5. -

    if stem 6ac91b4e7dd35551c6ea477deba5f82d is not a fence, all program-order-previous -conditional branch and indirect jump instructions are finished.

    -
  6. -
-
-
-

Action:

-
-
-
    -
  1. -

    if stem 6ac91b4e7dd35551c6ea477deba5f82d is a conditional branch or indirect jump -instruction, discard any untaken paths of execution, i.e., remove all -instruction instances that are not reachable by the branch/jump taken in -instruction_tree; and

    -
  2. -
  3. -

    record the instruction as finished, i.e., set finished to true.

    -
  4. -
-
-
-
-
-

B.3.6. Limitations

-
-
    -
  • -

    The model covers user-level RV64I and RV64A. In particular, it does -not support the misaligned atomicity granule PMA or the total store -ordering extension "Ztso". It should be trivial to adapt the model to -RV32I/A and to the G, Q and C extensions, but we have never tried it. -This will involve, mostly, writing Sail code for the instructions, with -minimal, if any, changes to the concurrency model.

    -
  • -
  • -

    The model covers only normal memory accesses (it does not handle I/O -accesses).

    -
  • -
  • -

    The model does not cover TLB-related effects.

    -
  • -
  • -

    The model assumes the instruction memory is fixed. In particular, the -Fetch instruction transition does not generate memory load operations, and the shared -memory is not involved in the transition. Instead, the model depends on -an external oracle that provides an opcode when given a memory location.

    -
  • -
  • -

    The model does not cover exceptions, traps and interrupts.

    -
  • -
-
-
-
-
-
-
-

Appendix C: Vector Assembly Code Examples

-
- -
-
-
-

Appendix D: Calling Convention for Vector State (Not authoritative - Placeholder Only)

-
- -
-
-
-

Index

-
- -
-
-
-

Bibliography

-
-
-

RISC-V ELF psABI Specification. github.com/riscv/riscv-elf-psabi-doc/ .

-
-
-

RISC-V Assembly Programmer’s Manual. github.com/riscv/riscv-asm-manual .

-
-
-

IEEE Standard for a 32-bit microprocessor. (1994). IEEE Std. 1754-1994.

-
-
-

ANSI/IEEE Std 754-2008, IEEE standard for floating-point arithmetic. (2008). Institute of Electrical and Electronics Engineers.

-
-
-

Amdahl, G. M., Blaauw, G. A., & Brooks, F. P., Jr. (1964). Architecture of the IBM System/360. IBM Journal of R. & D., 8(2).

-
-
-

Buchholz, W. (1962). Planning a computer system: Project Stretch. McGraw-Hill Book Company.

-
-
-

Heil, T. H., & Smith, J. E. (1996). Selective Dual Path Execution. University of Wisconsin - Madison.

-
-
-

Katevenis, M. G. H., Sherburne, R. W., Jr., Patterson, D. A., & Séquin, C. H. (1983, August). The RISC II micro-architecture. Proceedings VLSI 83 Conference.

-
-
-

Kim, H., Mutlu, O., Stark, J., & Patt, Y. N. (2005). Wish Branches: Combining Conditional Branching and Predication for Adaptive Predicated Execution. Proceedings of the 38th Annual IEEE/ACM International Symposium on Microarchitecture, 43–54.

-
-
-

Klauser, A., Austin, T., Grunwald, D., & Calder, B. (1998). Dynamic Hammock Predication for Non-Predicated Instruction Set Architectures. Proceedings of the 1998 International Conference on Parallel Architectures and Compilation Techniques.

-
-
-

Lee, D. D., Kong, S. I., Hill, M. D., Taylor, G. S., Hodges, D. A., Katz, R. H., & Patterson, D. A. (1989). A VLSI Chip Set for a Multiprocessor Workstation–Part I: An RISC Microprocessor with Coprocessor Interface and Support for Symbolic Processing. IEEE JSSC, 24(6), 1688–1698.

-
-
-

OpenCores. (2012). OpenRISC 1000 Architecture Manual, Architecture Version 1.0.

-
-
-

Pan, H., Hindman, B., & Asanović, K. (2009, March). Lithe: Enabling Efficient Composition of Parallel Libraries. Proceedings of the 1st USENIX Workshop on Hot Topics in Parallelism (HotPar ’09).

-
-
-

Pan, H., Hindman, B., & Asanović, K. (2010, June). Composing Parallel Software Efficiently with Lithe. 31st Conference on Programming Language Design and Implementation.

-
-
-

Patterson, D. A., & Séquin, C. H. (1981). RISC I: A Reduced Instruction Set VLSI Computer. ISCA, 443–458.

-
-
-

Sinharoy, B., Kalla, R., Starke, W. J., Le, H. Q., Cargnoni, R., Van Norstrand, J. A., Ronchetti, B. J., Stuecheli, J., Leenstra, J., Guthrie, G. L., Nguyen, D. Q., Blaner, B., Marino, C. F., Retter, E., & Williams, P. (2011). IBM POWER7 multicore server processor. IBM Journal of Research and Development, 55(3), 1–1.

-
-
-

Thornton, J. E. (1965). Parallel Operation in the Control Data 6600. Proceedings of the October 27-29, 1964, Fall Joint Computer Conference, Part II: Very High Speed Computer Systems, 33–40.

-
-
-

Tseng, J., & Asanović, K. (2000). Energy-Efficient Register Access. Proc. of the 13th Symposium on Integrated Circuits and Systems Design, 377–384.

-
-
-

Ungar, D., Blau, R., Foley, P., Samples, D., & Patterson, D. (1984). Architecture of SOAR: Smalltalk on a RISC. ISCA, 188–197.

-
-
-

Waterman, A. (2011). Improving Energy Efficiency and Reducing Code Size with RISC-V Compressed (Issue UCB/EECS-2011-63) [Master’s thesis]. University of California, Berkeley.

-
-
-

Waterman, A. (2016). Design of the RISC-V Instruction Set Architecture (Issue UCB/EECS-2016-1) [PhD thesis]. University of California, Berkeley.

-
-
-

Waterman, A., Lee, Y., Patterson, D. A., & Asanović, K. (2011). The RISC-V Instruction Set Manual, Volume I: Base User-Level ISA (UCB/EECS-2011-62; Issue UCB/EECS-2011-62). EECS Department, University of California, Berkeley.

-
-
-

Waterman, A., Lee, Y., Patterson, D. A., & Asanović, K. (2014). The RISC-V Instruction Set Manual, Volume I: Base User-Level ISA Version 2.0 (UCB/EECS-2014-54; Issue UCB/EECS-2014-54). EECS Department, University of California, Berkeley.

-
-
-
-
- - - \ No newline at end of file diff --git a/docs/Makefile b/docs/Makefile index d4bb2cbb9e..0e7527e705 100644 --- a/docs/Makefile +++ b/docs/Makefile @@ -1,20 +1,12 @@ -# Minimal makefile for Sphinx documentation -# +all: prepare sphinx -# You can set these variables from the command line, and also -# from the environment for the first two. -SPHINXOPTS ?= -SPHINXBUILD ?= sphinx-build -SOURCEDIR = . -BUILDDIR = _build +prepare: + make -C 04_cv32a65x/riscv priv-html unpriv-html + make -C 04_cv32a65x/design design-html -# Put it first so that "make" without argument is like "make help". -help: - @$(SPHINXBUILD) -M help "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O) + make -C 06_cv64a6_mmu/riscv priv-html unpriv-html -.PHONY: help Makefile +sphinx: + sphinx-build . _build -# Catch-all target: route all unknown targets to Sphinx using the new -# "make mode" option. $(O) is meant as a shortcut for $(SPHINXOPTS). -%: Makefile - @$(SPHINXBUILD) -M $@ "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O) +.PHONY: all prepare sphinx diff --git a/docs/README.md b/docs/README.md new file mode 100644 index 0000000000..4b38946c53 --- /dev/null +++ b/docs/README.md @@ -0,0 +1,26 @@ +# CVA6 documentation + +CVA6 documentation is published as a Read the Docs documentation. +It can be generated by running `make` in this directory. +This generates all necessary sub-documents. + +## Configuration-specific manuals + +For each supported target (e.g. `cv32a65x`), there are two manuals included in the main documentation: a tailored RISC-V instruction set manual, and a design documentation. +These documents are generated when building the main documentation. + +### Instruction set manual + +Instruction set manuals (privileged & unprivileged) are based on the official RISC-V Instruction Set Manual repository. +Some parts are stripped down or annotated to only include what's relevant for each specific configuration. 
+ +These manuals can be manually generated with: `make -C 04_cv32a65x/riscv priv-html unpriv-html`. +Replace `04_cv32a65x` with the desired target. +Some of the files used in this documentation (`config.adoc`) are directly generated from the RTL. + +### Design documentation + +Design documentation describes the internal architecture of the CVA6 processor. + +It can be manually generated with: `make -C 04_cv32a65x/design design-html`. +Some of the files used in this documentation (`config.adoc`, `parameters.adoc`, `port_*.adoc`, `csr.adoc`, `isa.adoc`) are directly generated from the RTL. diff --git a/docs/_static/theme_overrides.css b/docs/_static/theme_overrides.css new file mode 100644 index 0000000000..ce5fd21446 --- /dev/null +++ b/docs/_static/theme_overrides.css @@ -0,0 +1,12 @@ +@media screen { + /* content column + * + * RTD theme's default is 800px as max width for the content, but we have + * tables with tons of columns, which need the full width of the view-port. + * + * Comment from yocto project theme_overrides.css + */ + + .wy-nav-content{ max-width: none; } + +} diff --git a/docs/riscv-isa/src/config_define.adoc b/docs/common/config_define.adoc similarity index 100% rename from docs/riscv-isa/src/config_define.adoc rename to docs/common/config_define.adoc diff --git a/docs/conf.py b/docs/conf.py index 4c7c11135b..08a862da29 100644 --- a/docs/conf.py +++ b/docs/conf.py @@ -37,7 +37,7 @@ # List of patterns, relative to source directory, that match files and # directories to ignore when looking for source files. # This pattern also affects html_static_path and html_extra_path. 
-exclude_patterns = ['_build', 'Thumbs.db', '.DS_Store', '*.yaml', '*.xml', +exclude_patterns = ['_build', '**/build', 'Thumbs.db', '.DS_Store', '*.yaml', '*.xml', 'csr-from-ip-xact/**/*_csr.md', 'csr-ip-xact/**/cva6_csr.*'] @@ -48,6 +48,7 @@ # #html_theme = 'alabaster' html_theme = 'sphinx_rtd_theme' +pygments_style = 'monokai' # Theme options are theme-specific and customize the look and feel of a theme # further. For a list of options available for each theme, see the @@ -61,7 +62,11 @@ # so a file named "default.css" will overwrite the builtin "default.css". #html_static_path = ['ystatic'] # Set html_static_path to null on the advice of RTDs: -html_static_path = [] +html_static_path = ['_static'] + +# Add custom CSS and JS files +html_css_files = ['theme_overrides.css'] +html_js_files = [] # Custom sidebar templates, must be a dictionary that maps document names # to template names. diff --git a/docs/design/build.mk b/docs/design/build.mk new file mode 100644 index 0000000000..c401016342 --- /dev/null +++ b/docs/design/build.mk @@ -0,0 +1,44 @@ +# Copyright 2024 Thales DIS France SAS +# Licensed under the Solderpad Hardware License, Version 2.1 (the "License"); +# you may not use this file except in compliance with the License. 
+# SPDX-License-Identifier: Apache-2.0 WITH SHL-2.1 +# You may obtain a copy of the License at https://solderpad.org/licenses/ +# +# Original Author: Jean-Roch COULON - Thales + +ifeq ($(CONFIG),) +$(error CONFIG must be defined) +endif + +current_dir = $(shell pwd) + +# Path of current file, intended to be included by a configuration subfolder +design_dir := $(dir $(lastword $(MAKEFILE_LIST))) + +all: design-pdf design-html + +setup: + mkdir -p build + + cp -r $(design_dir)/design-manual/* build + cp -r $(design_dir)/../riscv-isa/riscv-isa-manual/docs-resources build + cp $(design_dir)/../common/*.adoc build/source + + cp -rf source/* build/source + + cd ../../../config/gen_from_riscv_config && python3 scripts/riscv_config_gen.py -s ../riscv-config/$(CONFIG)/generated/isa_gen.yaml -i templates/isa_template.yaml -m updaters/$(CONFIG)/isa_updater.yaml -t $(CONFIG) -f adoc + cd ../../../config/gen_from_riscv_config && python3 scripts/riscv_config_gen.py -s ../riscv-config/$(CONFIG)/generated/isa_gen.yaml -c ../riscv-config/$(CONFIG)/generated/custom_gen.yaml -m updaters/$(CONFIG)/csr_updater.yaml -t $(CONFIG) -f adoc + cp -r $(design_dir)/../../config/gen_from_riscv_config/$(CONFIG)/* build/source + + cd ../.. 
&& python3 scripts/spec_builder.py --target $(CONFIG) --gen-config $(current_dir)/build/source/config.adoc --gen-parameters $(current_dir)/build/source/parameters.adoc --gen-ports $(current_dir)/build/source + +design-pdf: setup + cd build; make SKIP_DOCKER=true build/design.pdf + cp ./build/build/design.pdf design-$(CONFIG).pdf + +design-html: setup + cd build; make SKIP_DOCKER=true build/design.html + cp ./build/build/design.html design-$(CONFIG).html + +clean: + rm -rf build diff --git a/docs/design/design-manual/Makefile b/docs/design/design-manual/Makefile new file mode 100644 index 0000000000..e3bc3cbe1e --- /dev/null +++ b/docs/design/design-manual/Makefile @@ -0,0 +1,122 @@ +# Makefile for RISC-V ISA Manuals +# +# This work is licensed under the Creative Commons Attribution-ShareAlike 4.0 +# International License. To view a copy of this license, visit +# http://creativecommons.org/licenses/by-sa/4.0/ or send a letter to +# Creative Commons, PO Box 1866, Mountain View, CA 94042, USA. +# +# SPDX-License-Identifier: CC-BY-SA-4.0 +# +# Description: +# +# This Makefile is designed to automate the process of building and packaging +# the documentation for RISC-V ISA Manuals. It supports multiple build targets +# for generating documentation in various formats (PDF, HTML, EPUB). +# +# Building with a preinstalled docker container is recommended. 
+# Install by running: +# +# docker pull riscvintl/riscv-docs-base-container-image:latest +# + +DOCS := design + +DATE ?= $(shell date +%Y-%m-%d) +DOCKER_IMG := riscvintl/riscv-docs-base-container-image:latest +ifneq ($(SKIP_DOCKER),true) + DOCKER_CMD = \ + docker run --rm -v ${PWD}/$@.workdir:/build -w /build \ + ${DOCKER_IMG} \ + /bin/sh -c + DOCKER_QUOTE := " +else + DOCKER_CMD = \ + cd $@.workdir && +endif + +WORKDIR_SETUP = \ + rm -rf $@.workdir && \ + mkdir -p $@.workdir && \ + cp -r source docs-resources $@.workdir + +WORKDIR_TEARDOWN = \ + mv $@.workdir/$@ $@ && \ + rm -rf $@.workdir + +SRC_DIR := source +BUILD_DIR := build + +DOCS_PDF := $(addprefix $(BUILD_DIR)/, $(addsuffix .pdf, $(DOCS))) +DOCS_HTML := $(addprefix $(BUILD_DIR)/, $(addsuffix .html, $(DOCS))) +DOCS_EPUB := $(addprefix $(BUILD_DIR)/, $(addsuffix .epub, $(DOCS))) + +ENV := LANG=C.utf8 +XTRA_ADOC_OPTS := +ASCIIDOCTOR_PDF := $(ENV) asciidoctor-pdf +ASCIIDOCTOR_HTML := $(ENV) asciidoctor +ASCIIDOCTOR_EPUB := $(ENV) asciidoctor-epub3 +OPTIONS := --trace \ + -a compress \ + -a mathematical-format=svg \ + -a pdf-fontsdir=docs-resources/fonts \ + -a pdf-theme=docs-resources/themes/riscv-pdf.yml \ + $(XTRA_ADOC_OPTS) \ + -D build \ + --failure-level=WARN \ + -a attribute-missing=warn +REQUIRES := + +.PHONY: all build clean build-container build-no-container build-docs build-pdf build-html build-epub + +all: build + +build-docs: $(DOCS_PDF) $(DOCS_HTML) $(DOCS_EPUB) +build-pdf: $(DOCS_PDF) +build-html: $(DOCS_HTML) +build-epub: $(DOCS_EPUB) + +ALL_SRCS := $(shell git ls-files $(SRC_DIR)) + +$(BUILD_DIR)/%.pdf: $(SRC_DIR)/%.adoc $(ALL_SRCS) + $(WORKDIR_SETUP) + $(DOCKER_CMD) $(DOCKER_QUOTE) $(ASCIIDOCTOR_PDF) $(OPTIONS) $(REQUIRES) $< $(DOCKER_QUOTE) + $(WORKDIR_TEARDOWN) + +$(BUILD_DIR)/%.html: $(SRC_DIR)/%.adoc $(ALL_SRCS) + $(WORKDIR_SETUP) + $(DOCKER_CMD) $(DOCKER_QUOTE) $(ASCIIDOCTOR_HTML) $(OPTIONS) $(REQUIRES) $< $(DOCKER_QUOTE) + $(WORKDIR_TEARDOWN) + +$(BUILD_DIR)/%.epub: $(SRC_DIR)/%.adoc 
$(ALL_SRCS) + $(WORKDIR_SETUP) + $(DOCKER_CMD) $(DOCKER_QUOTE) $(ASCIIDOCTOR_EPUB) $(OPTIONS) $(REQUIRES) $< $(DOCKER_QUOTE) + $(WORKDIR_TEARDOWN) + +build: + @echo "Checking if Docker is available..." + @if command -v docker >/dev/null 2>&1 ; then \ + echo "Docker is available, building inside Docker container..."; \ + $(MAKE) build-container; \ + else \ + echo "Docker is not available, building without Docker..."; \ + $(MAKE) build-no-container; \ + fi + +build-container: + @echo "Starting build inside Docker container..." + $(MAKE) build-docs + @echo "Build completed successfully inside Docker container." + +build-no-container: + @echo "Starting build..." + $(MAKE) SKIP_DOCKER=true build-docs + @echo "Build completed successfully." + +# Update docker image to latest +docker-pull-latest: + docker pull ${DOCKER_IMG} + +clean: + @echo "Cleaning up generated files..." + rm -rf $(BUILD_DIR) + @echo "Cleanup completed." diff --git a/docs/04_cv32a65x/design/source/AXI.rst b/docs/design/design-manual/source/AXI.adoc similarity index 86% rename from docs/04_cv32a65x/design/source/AXI.rst rename to docs/design/design-manual/source/AXI.adoc index 595f64397f..e8bf7b80d9 100644 --- a/docs/04_cv32a65x/design/source/AXI.rst +++ b/docs/design/design-manual/source/AXI.adoc @@ -1,4 +1,4 @@ -.. +//// Copyright 2023 Thales DIS France SAS Licensed under the Solderpad Hardware Licence, Version 2.0 (the "License"); you may not use this file except in compliance with the License. @@ -6,5 +6,7 @@ You may obtain a copy of the License at https://solderpad.org/licenses/ Original Author: Jean-Roch COULON - Thales +//// +[[axi]] -.. 
include:: ../../../01_cva6_user/AXI_Interface.rst +include::AXI_Interface.adoc[] diff --git a/docs/design/design-manual/source/AXI_Interface.adoc b/docs/design/design-manual/source/AXI_Interface.adoc new file mode 100644 index 0000000000..1da1cf31c0 --- /dev/null +++ b/docs/design/design-manual/source/AXI_Interface.adoc @@ -0,0 +1,895 @@ +//// + Copyright (c) 2023 OpenHW Group + Copyright (c) 2023 Thales + + SPDX-License-Identifier: Apache-2.0 WITH SHL-2.1 + + Original Author: Alae Eddine EZ ZEJJARI (alae-eddine.ez-zejjari@external.thalesgroup.com) +//// + +[[cva6_axi]] +AXI +~~~ + +[[cva6_axi-introduction]] +Introduction +^^^^^^^^^^^^ +In this chapter, we describe in detail the restriction that apply to the supported features. + +In order to understand how the AXI memory interface behaves in CVA6, it is necessary to read the AMBA AXI and ACE Protocol Specification (https://developer.arm.com/documentation/ihi0022/hc) and this chapter. + +_Applicability of this chapter to configurations:_ + +[cols=",",options="header",] +|============================= +|Configuration |Implementation +|CV32A60AX |AXI included +|CV32A60X |AXI included +|============================= + +[[about-the-axi4-protocol]] +About the AXI4 protocol ++++++++++++++++++++++++ + +The AMBA AXI protocol supports high-performance, high-frequency system designs for communication between Manager and Subordinate components. + +The AXI protocol features are: + +* It is suitable for high-bandwidth and low-latency designs. +* High-frequency operation is provided, without using complex bridges. +* The protocol meets the interface requirements of a wide range of components. +* It is suitable for memory controllers with high initial access latency. +* Flexibility in the implementation of interconnect architectures is provided. +* It is backward-compatible with AHB and APB interfaces. + +The key features of the AXI protocol are: + +* Separate address/control and data phases. 
+* Support for unaligned data transfers, using byte strobes. +* Uses burst-based transactions with only the start address issued. +* Separate read and write data channels, that can provide low-cost Direct Memory Access (DMA). +* Support for issuing multiple outstanding addresses. +* Support for out-of-order transaction completion. +* Permits easy addition of register stages to provide timing closure. + +The present specification is based on: https://developer.arm.com/documentation/ihi0022/hc + + +[[axi4-and-cva6]] +AXI4 and CVA6 ++++++++++++++ + +The AXI bus protocol is used with the CVA6 processor as a memory interface. Since the processor is the one that initiates the connection with the memory, it will have a manager interface to send requests to the subordinate, which will be the memory. + +Features supported by CVA6 are the ones in the AMBA AXI4 specification and the Atomic Operation feature from AXI5, with restrictions that apply to some features. + +This doesn’t mean that the full set of signals available on an AXI interface is supported by the CVA6. Nevertheless, all required AXI signals are implemented. + +Supported AXI4 features are defined in AXI Protocol Specification sections: A3, A4, A5, A6 and A7. + +Supported AXI5 features are defined in AXI Protocol Specification section: E1.1. + + +[[signal-description-section-a2]] +Signal Description (Section A2) +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + +This section introduces the AXI memory interface signals of CVA6. Most of the signals are supported by CVA6; the tables summarizing the signals identify the exceptions. + +In the following tables, the *Src* column tells whether the signal is driven by the Manager or the Subordinate. + +The AXI required and optional signals, and the default signal values that apply when an optional signal is not implemented are defined in AXI Protocol Specification section A9.3. 
+ +[[global-signals-section-a2.1]] +Global signals (Section A2.1) ++++++++++++++++++++++++++++++ + +Table 2.1 shows the global AXI memory interface signals. + +[width="100%",cols="20%,20%,60%",options="header",] +|========================================================== +|*Signal* |*Src* |*Description* +|*ACLK* |Clock source a| +[verse] +-- +Global clock signal. Synchronous signals are sampled on the +rising edge of the global clock. +-- + +|*ARESETn* |Reset source a| +[verse] +-- +Global reset signal. This signal is active-LOW. +-- + +|========================================================== + +[[write-address-channel-signals-section-a2.2]] +Write address channel signals (Section A2.2) +++++++++++++++++++++++++++++++++++++++++++++ + +Table 2.2 shows the AXI memory interface write address channel signals. Unless the description indicates otherwise, a signal can take any parameter if it is supported. + +[width="100%",cols="15%,15%,15%,55%",options="header",] +|===================================================================== +|*Signal* |*Src* |*Support* |*Description* +|*AWID* |M a| +[verse] +-- +Yes +(optional) +-- + + a| +[verse] +-- +Identification tag for a write transaction. +CVA6 gives the id depending on the type of transaction. +See transaction_identifiers_label. +-- + +|*AWADDR* |M |Yes a| +[verse] +-- +The address of the first transfer in a write transaction. +-- + +|*AWLEN* |M a| +[verse] +-- +Yes +(optional) +-- + + a| +[verse] +-- +Length, the exact number of data transfers in a write +transaction. This information determines the number of +data transfers associated with the address. +All write transactions performed by CVA6 are of length 1. +(AWLEN = 0b00000000) +-- + +|*AWSIZE* |M a| +[verse] +-- +Yes +(optional) +-- + + a| +[verse] +-- +Size, the number of bytes in each data transfer in a write +transaction +See address_structure_label. 
+-- + +|*AWBURST* |M a| +[verse] +-- +Yes +(optional) +-- + + a| +[verse] +-- +Burst type, indicates how address changes between each +transfer in a write transaction. +All write transactions performed by CVA6 are of burst type +INCR. (AWBURST = 0b01) +-- + +|*AWLOCK* |M a| +[verse] +-- +Yes +(optional) +-- + + a| +[verse] +-- +Provides information about the atomic characteristics of a +write transaction. +-- + +|*AWCACHE* |M a| +[verse] +-- +Yes +(optional) +-- + + a| +[verse] +-- +Indicates how a write transaction is required to progress +through a system. +The subordinate is always of type Normal Non-cacheable Non-bufferable. +(AWCACHE = 0b0010) +-- + +|*AWPROT* |M |Yes a| +[verse] +-- +Protection attributes of a write transaction: +privilege, security level, and access type. +The value of AWPROT is always 0b000. +-- + +|*AWQOS* |M a| +[verse] +-- +No +(optional) +-- + + a| +[verse] +-- +Quality of Service identifier for a write transaction. +AWQOS = 0b0000 +-- + +|*AWREGION* |M a| +[verse] +-- +No +(optional) +-- + + a| +[verse] +-- +Region indicator for a write transaction. +AWREGION = 0b0000 +-- + +|*AWUSER* |M a| +[verse] +-- +No +(optional) +-- + + a| +[verse] +-- +User-defined extension for the write address channel. +AWUSER = 0b00 +-- + +|*AWATOP* |M a| +[verse] +-- +Yes +(optional) +-- + + a| +[verse] +-- +AWATOP indicates the Properties of the Atomic Operation +used for a write transaction. +See atomic_transactions_label. +-- + +|*AWVALID* |M |Yes a| +[verse] +-- +Indicates that the write address channel signals are valid. +-- + +|*AWREADY* |S |Yes a| +[verse] +-- +Indicates that a transfer on the write address channel +can be accepted. +-- + +|===================================================================== + +[[write-data-channel-signals-section-a2.3]] +Write data channel signals (Section A2.3) ++++++++++++++++++++++++++++++++++++++++++ + +Table 2.3 shows the AXI write data channel signals. 
Unless the description indicates otherwise, a signal can take any parameter if is supported. + +[width="100%",cols="15%,15%,15%,55%",options="header",] +|========================================================== +|*Signal* |*Src* |*Support* |*Description* +|*WDATA* |M |Yes a| +[verse] +-- +Write data. +-- + +|*WSTRB* |M a| +[verse] +-- +Yes +(optional) +-- + + a| +[verse] +-- +Write strobes, indicate which byte lanes hold valid data +See data_read_and_write_structure_label. +-- + +|*WLAST* |M |Yes a| +[verse] +-- +Indicates whether this is the last data transfer in a write +transaction. +-- + +|*WUSER* |M a| +[verse] +-- +Yes +(optional) +-- + + a| +[verse] +-- +User-defined extension for the write data channel. +-- + +|*WVALID* |M |Yes a| +[verse] +-- +Indicates that the write data channel signals are valid. +-- + +|*WREADY* |S |Yes a| +[verse] +-- +Indicates that a transfer on the write data channel can be +accepted. +-- + +|========================================================== + +[[write-response-channel-signals-section-a2.4]] +Write Response Channel signals (Section A2.4) ++++++++++++++++++++++++++++++++++++++++++++++ + +Table 2.4 shows the AXI write response channel signals. Unless the description indicates otherwise, a signal can take any parameter if is supported. + +[width="100%",cols="15%,15%,15%,55%",options="header",] +|============================================================= +|*Signal* |*Src* |*Support* |*Description* +|*BID* |S a| +[verse] +-- +Yes +(optional) +-- + + a| +[verse] +-- +Identification tag for a write response. +CVA6 gives the id depending on the type of transaction. +See transaction_identifiers_label. +-- + +|*BRESP* |S |Yes a| +[verse] +-- +Write response, indicates the status of a write transaction. +See read_and_write_response_structure_label. +-- + +|*BUSER* |S a| +[verse] +-- +No +(optional) +-- + + a| +[verse] +-- +User-defined extension for the write response channel. +Not supported. 
+-- + +|*BVALID* |S |Yes a| +[verse] +-- +Indicates that the write response channel signals are valid. +-- + +|*BREADY* |M |Yes a| +[verse] +-- +Indicates that a transfer on the write response channel can be +accepted. +-- + +|============================================================= + +[[read-address-channel-signals-section-a2.5]] +Read address channel signals (Section A2.5) ++++++++++++++++++++++++++++++++++++++++++++ + +Table 2.5 shows the AXI read address channel signals. Unless the description indicates otherwise, a signal can take any parameter if is supported. + + +[width="100%",cols="15%,15%,15%,55%",options="header",] +|================================================================ +|*Signal* |*Src* |*Support* |*Description* +|*ARID* |M a| +[verse] +-- +Yes +(optional) +-- + + a| +[verse] +-- +Identification tag for a read transaction. +CVA6 gives the id depending on the type of transaction. +See transaction_identifiers_label. +-- + +|*ARADDR* |M a| +[verse] +-- +Yes +-- + + a| +[verse] +-- +The address of the first transfer in a read transaction. +-- + +|*ARLEN* |M a| +[verse] +-- +Yes +(optional) +-- + + a| +[verse] +-- +Length, the exact number of data transfers in a read +transaction. This information determines the number of data +transfers associated with the address. +All read transactions performed by CVA6 have a length equal to 0, +ICACHE_LINE_WIDTH/64 or DCACHE_LINE_WIDTH/64. +-- + +|*ARSIZE* |M a| +[verse] +-- +Yes +(optional) +-- + + a| +[verse] +-- +Size, the number of bytes in each data transfer in a read +transaction +See address_structure_label. +-- + +|*ARBURST* |M a| +[verse] +-- +Yes +(optional) +-- + + a| +[verse] +-- +Burst type, indicates how address changes between each +transfer in a read transaction. +All Read transactions performed by CVA6 are of burst type INCR. 
+(ARBURST = 0b01) +-- + +|*ARLOCK* |M a| +[verse] +-- +Yes +(optional) +-- + + a| +[verse] +-- +Provides information about the atomic characteristics of +a read transaction. +-- + +|*ARCACHE* |M a| +[verse] +-- +Yes +(optional) +-- + + a| +[verse] +-- +Indicates how a read transaction is required to progress +through a system. +The memory is always of type Normal Non-cacheable Non-bufferable. +(ARCACHE = 0b0010) +-- + +|*ARPROT* |M a| +[verse] +-- +Yes +-- + + a| +[verse] +-- +Protection attributes of a read transaction: +privilege, security level, and access type. +The value of ARPROT is always 0b000. +-- + +|*ARQOS* |M a| +[verse] +-- +No +(optional) +-- + + a| +[verse] +-- +Quality of Service identifier for a read transaction. +ARQOS= 0b00 +-- + +|*ARREGION* |M a| +[verse] +-- +No +(optional) +-- + + a| +[verse] +-- +Region indicator for a read transaction. +ARREGION= 0b00 +-- + +|*ARUSER* |M a| +[verse] +-- +No +(optional) +-- + + a| +[verse] +-- +User-defined extension for the read address channel. +ARUSER= 0b00 +-- + +|*ARVALID* |M a| +[verse] +-- +Yes +(optional) +-- + + a| +[verse] +-- +Indicates that the read address channel signals are valid. +-- + +|*ARREADY* |S a| +[verse] +-- +Yes +(optional) +-- + + a| +[verse] +-- +Indicates that a transfer on the read address channel can be +accepted. +-- + +|================================================================ + +[[read-data-channel-signals-section-a2.6]] +Read data channel signals (Section A2.6) +++++++++++++++++++++++++++++++++++++++++ + +Table 2.6 shows the AXI read data channel signals. Unless the description indicates otherwise, a signal can take any parameter if is supported. + + +[width="100%",cols="15%,15%,15%,55%",options="header",] +|================================================================== +|*Signal* |*Src* |*Support* |*Description* +|*RID* |S a| +[verse] +-- +Yes +(optional) +-- + + a| +[verse] +-- +The ID tag of the read data transfer. 
+CVA6 gives the id depending on the type of transaction. +See transaction_identifiers_label. +-- + +|*RDATA* |S |Yes a| +[verse] +-- +Read data. +-- + +|*RLAST* |S |Yes a| +[verse] +-- +Indicates whether this is the last data transfer in a read +transaction. +-- + +|*RUSER* |S a| +[verse] +-- +Yes +(optional) +-- + + a| +[verse] +-- +User-defined extension for the read data channel. +Not supported. +-- + +|*RVALID* |S |Yes a| +[verse] +-- +Indicates that the read data channel signals are valid. +-- + +|*RREADY* |M |Yes a| +[verse] +-- +Indicates that a transfer on the read data channel can be accepted. +-- + +|================================================================== + +[[single-interface-requirements-transaction-structure-section-a3.4]] +Single Interface Requirements: Transaction structure (Section A3.4) +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + +This section describes the structure of transactions. The following sections define the address, data, and response +structures. + +[[address_structure_label]] +Address structure (Section A3.4.1) +++++++++++++++++++++++++++++++++++ + +The AXI protocol is burst-based. The Manager begins each burst by driving control information and the address of the first byte in the transaction to the Subordinate. As the burst progresses, the Subordinate must calculate the addresses of subsequent transfers in the burst. + +*Burst length* + +The burst length is specified by: + +* `ARLEN[7:0]`, for read transfers +* `AWLEN[7:0]`, for write transfers + +The burst length for AXI4 is defined as: `Burst_Length = AxLEN[7:0] + 1`. 
+ +CVA6 has some limitations governing the use of bursts: + +* _All read transactions performed by CVA6 are of burst length equal to 0, ICACHE_LINE_WIDTH/64 or DCACHE_LINE_WIDTH/64._ +* _All write transactions performed by CVA6 are of burst length equal to 1._ + +*Burst size* + +The maximum number of bytes to transfer in each data transfer, or beat, in a burst, is specified by: + +* `ARSIZE[2:0]`, for read transfers +* `AWSIZE[2:0]`, for write transfers + +_The maximum value that can be taken by AxSIZE is log2(AXI DATA WIDTH/8) (8 bytes per transfer)._ +_If(RV32) AWSIZE < 3 (The maximum store size is 4 bytes)_ + +*Burst type* + +The AXI protocol defines three burst types: + +* *FIXED* +* *INCR* +* *WRAP* + +The burst type is specified by: + +* `ARBURST[1:0]`, for read transfers +* `AWBURST[1:0]`, for write transfers + +_All transactions performed by CVA6 are of burst type INCR. (AxBURST = 0b01)_ + +[[data_read_and_write_structure_label]] +Data read and write structure: (Section A3.4.4) ++++++++++++++++++++++++++++++++++++++++++++++++ + +*Write strobes* + +The `WSTRB[n:0]` signals when HIGH, specify the byte lanes of the data bus that contain valid information. There is one write strobe +for each 8 bits of the write data bus, therefore `WSTRB[n]` corresponds to `WDATA[(8n)+7: (8n)]`. + +_Write Strobe width is equal to (AXI_DATA_WIDTH/8) (n = (AXI_DATA_WIDTH/8)-1)._ + +_The size of transactions performed by CVA6 is equal to the number of data byte lanes containing valid information._ +_This means 1, 2, 4, ... or (AXI_DATA_WIDTH/8) byte lanes containing valid information._ +_CVA6 doesn't perform unaligned memory accesses, therefore WSTRB only takes combinations that correspond to aligned accesses._ +_If(RV32) WSTRB < 255 (Since AWSIZE is lower than 3, the data bus cannot have more than 4 valid byte lanes)_ + +*Unaligned transfers* + +For any burst that is made up of data transfers wider than 1 byte, the first bytes accessed might be unaligned with the natural +address boundary. 
For example, a 32-bit data packet that starts at a byte address of 0x1002 is not aligned to the natural 32-bit +transfer size. + +_CVA6 does not perform Unaligned transfers._ + +[[read_and_write_response_structure_label]] +Read and write response structure (Section A3.4.5) +++++++++++++++++++++++++++++++++++++++++++++++++++ + +The AXI protocol provides response signaling for both read and write transactions: + +* For read transactions, the response information from the Subordinate is signaled on the read data channel. +* For write transactions, the response information is signaled on the write response channel. + +CVA6 does not consider the responses sent by the memory except in the exclusive Access ( `XRESP[1:0]` = 0b01 ). + +[[transaction-attributes-memory-types-section-a4]] +Transaction Attributes: Memory types (Section A4) +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + +This section describes the attributes that determine how a transaction should be treated by the AXI subordinate that is connected to the CVA6. + +`AxCACHE` always takes 0b0010. The subordinate should be of type Normal Non-cacheable Non-bufferable. + +The required behavior for Normal Non-cacheable Non-bufferable memory is: + +* The write response must be obtained from the final destination. +* Read data must be obtained from the final destination. +* Transactions are modifiable. +* Writes can be merged. + +[[transaction_identifiers_label]] +Transaction Identifiers (Section A5) +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + +The AXI protocol includes AXI ID transaction identifiers. A Manager can use these to identify separate transactions that must be returned in order. + +CVA6 identifies each type of transaction with a specific ID: + +* For read transactions, id can be 0 or 1. (0 for instruction fetch and 1 for data) +* For write transactions, id = 1. +* For Atomic operations, id = 3. 
This ID must be sent in the write channels and also in the read channel if the transaction performed requires response data. +* For Exclusive transaction, id = 3. + +[[axi-ordering-model-section-a6]] +AXI Ordering Model (Section A6) +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + +[[axi-ordering-model-overview-section-a6.1]] +AXI ordering model overview (Section A6.1) +++++++++++++++++++++++++++++++++++++++++++ + +The AXI ordering model is based on the use of the transaction identifier, which is signaled on `ARID` or `AWID`. + +Transaction requests on the same channel, with the same ID and destination are guaranteed to remain in order. + +Transaction responses with the same ID are returned in the same order as the requests were issued. + +Write transaction requests with the same destination are guaranteed to remain in order, because all write transactions performed by CVA6 have the same ID. + +CVA6 can perform multiple outstanding write address transactions. + +CVA6 cannot perform a Read transaction and a Write one at the same time. Therefore there are no ordering problems between Read and Write transactions. + +The ordering model does not give any ordering guarantees between: + +* Transactions from different Managers +* Read Transactions with different IDs +* Transactions to different Memory locations + +If the CVA6 requires ordering between transactions that have no ordering guarantee, the Manager must wait to receive a response to the first transaction before issuing the second transaction. + +[[memory-locations-and-peripheral-regions-section-a6.2]] +Memory locations and Peripheral regions (Section A6.2) +++++++++++++++++++++++++++++++++++++++++++++++++++++++ + +The address map in AMBA is made up of Memory locations and Peripheral regions. But the AXI is associated with the memory interface of CVA6. + +A Memory location has all of the following properties: + +* A read of a byte from a Memory location returns the last value that was written to that byte location. 
+* A write to a byte of a Memory location updates the value at that location to a new value that is obtained by a subsequent read of that location. +* Reading or writing to a Memory location has no side-effects on any other Memory location. +* Observation guarantees for Memory are given for each location. +* The size of a Memory location is equal to the single-copy atomicity size for that component. + +[[transactions-and-ordering-section-a6.3]] +Transactions and ordering (Section A6.3) +++++++++++++++++++++++++++++++++++++++++ + +A transaction is a read or a write to one or more address locations. The locations are determined by AxADDR and any relevant qualifiers such as the Non-secure bit in `AxPROT`. + +* Ordering guarantees are given only between accesses to the same Memory location or Peripheral region. +* A transaction to a Peripheral region must be entirely contained within that region. +* A transaction that spans multiple Memory locations has multiple ordering guarantees. + +Transaction performed by CVA6 is of type Normal, because `AxCACHE[1]` is asserted. + +Normal transactions are used to access Memory locations and are not expected to be used to access Peripheral regions. + +A Normal access to a Peripheral region must complete in a protocol-compliant manner, but the result is IMPLEMENTATION DEFINED. + +A write transaction performed by CVA6 is Non-bufferable (It is not possible to send an early response before the transaction reach the final destination), because `AxCACHE[0]` is deasserted. + +[[ordered-write-observation-section-a6.8]] +Ordered write observation (Section A6.8) +++++++++++++++++++++++++++++++++++++++++ + +To improve compatibility with interface protocols that support a different ordering model, a Subordinate interface can give stronger ordering guarantees for write transactions. A stronger ordering guarantee is known as Ordered Write Observation. 
+ +_The CVA6 AXI interface exhibits Ordered Write Observation, so the Ordered_Write_Observation property is True._ + +An interface that exhibits Ordered Write Observation gives guarantees for write transactions that are not dependent on the destination or address: + +* A write W1 is guaranteed to be observed by a write W2, where W2 is issued after W1, from the same Manager, with the same ID. + +[[atomic_transactions_label]] +Atomic transactions (Section E1.1) +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + +AMBA 5 introduces Atomic transactions, which perform more than just a single access and have an operation that is associated with the transaction. Atomic transactions enable sending the operation to the data, permitting the operation to be performed closer to where the data is located. Atomic transactions are suited to situations where the data is located a significant distance from the agent that must perform the operation. + +_If(!RVA) AWATOP = 0 (If AMO instructions are not supported, CVA6 cannot perform Atomic transactions)_ + +_CVA6 supports just the AtomicLoad and AtomicSwap transaction. So `AWATOP[5:4]` can be 00, 10 or 11._ + +_CVA6 performs only little-endian operation. So `AWATOP[3]` = 0._ + +_For AtomicLoad, CVA6 supports all arithmetic operations encoded on the lower-order `AWATOP[2:0]` signals._ + +[[cva6-constraints]] +CVA6 Constraints +^^^^^^^^^^^^^^^^ + +This section describes cross-cases between several features that are not supported by CVA6. 
+ +* ARID = 0 && ARSIZE = log(AXI_DATA_WIDTH/8), CVA6 always requests max number of words in case of read transaction with ID 0 (instruction fetch) +* if(RV32) ARSIZE != 3 && ARLEN = 0 && ARID = 1, the maximum load instruction size is 4 bytes +* if(!RVA) AxLOCK = 0, if AMO instructions are not supported, CVA6 cannot perform exclusive transaction +* if(RVA) AxLOCK = 1 => AxSIZE > 1, CVA6 doesn't perform exclusive transaction with size lower than 4 bytes diff --git a/docs/design/design-manual/source/CSRs.adoc b/docs/design/design-manual/source/CSRs.adoc new file mode 100644 index 0000000000..7c1c0b7b64 --- /dev/null +++ b/docs/design/design-manual/source/CSRs.adoc @@ -0,0 +1,3 @@ +[[csrs]] + +include::csr/csr.adoc[] \ No newline at end of file diff --git a/docs/04_cv32a65x/design/source/csr_list.rst b/docs/design/design-manual/source/CVXIF.adoc similarity index 85% rename from docs/04_cv32a65x/design/source/csr_list.rst rename to docs/design/design-manual/source/CVXIF.adoc index 07373e9c5f..359125ad06 100644 --- a/docs/04_cv32a65x/design/source/csr_list.rst +++ b/docs/design/design-manual/source/CVXIF.adoc @@ -1,4 +1,4 @@ -.. +//// Copyright 2023 Thales DIS France SAS Licensed under the Solderpad Hardware Licence, Version 2.0 (the "License"); you may not use this file except in compliance with the License. @@ -6,6 +6,8 @@ You may obtain a copy of the License at https://solderpad.org/licenses/ Original Author: Jean-Roch COULON - Thales +//// +[[cvxif]] -.. 
include:: ../../../csr-from-ip-xact/cv32a60x/csr_list.rst +include::CVX_Interface_Coprocessor.adoc[] diff --git a/docs/design/design-manual/source/CVX_Interface_Coprocessor.adoc b/docs/design/design-manual/source/CVX_Interface_Coprocessor.adoc new file mode 100644 index 0000000000..4290a1372e --- /dev/null +++ b/docs/design/design-manual/source/CVX_Interface_Coprocessor.adoc @@ -0,0 +1,294 @@ +[[cva6_cvx_interface_coprocessor]] +CV-X-IF Interface and Coprocessor +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +The CV-X-IF interface of CVA6 allows extending its supported instruction +set with external coprocessors. + +_Applicability of this chapter to configurations:_ + +[cols=",",options="header",] +|============================= +|Configuration |Implementation +|CV32A60AX |CV-X-IF included +|CV32A60X |CV-X-IF included +|CV64A6_MMU |CV-X-IF included +|============================= + +[[cv-x-if-interface-specification]] +CV-X-IF interface specification +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + +[[description]] +Description ++++++++++++ + +This design specification presents global functionalities of +Core-V-eXtension-Interface (XIF, CVXIF, CV-X-IF, X-interface) in the CVA6 core. + + +[source,sourceCode,text] +---- +The CORE-V X-Interface is a RISC-V eXtension interface that provides a +generalized framework suitable to implement custom coprocessors and ISA +extensions for existing RISC-V processors. + +--core-v-xif Readme, https://github.com/openhwgroup/core-v-xif +---- + +The specification of the CV-X-IF bus protocol can be found at [CV-X-IF]. + +CV-X-IF aims to: + +* Create interfaces to connect a coprocessor to the CVA6 to execute instructions. +* Offload CVA6 illegal instructions to the coprocessor to be executed. +* Get the results of offloaded instructions from the coprocessor so they are written back into the CVA6 register file. +* Add standard RISC-V instructions unsupported by CVA6 or custom instructions and implement them in a coprocessor.
+* Kill offloaded instructions to allow speculative execution in the coprocessor. (Unsupported in CVA6 yet) +* Connect the coprocessor to memory via the CVA6 Load and Store Unit. (Unsupported in CVA6 yet) + +The coprocessor operates like another functional unit so it is connected +to the CVA6 in the execute stage. + +Only the 3 mandatory interfaces from the CV-X-IF specification (issue, commit and result +) have been implemented. +Compressed interface, Memory Interface and Memory result interface are not yet +implemented in the CVA6. + +[[supported-parameters]] +Supported Parameters +++++++++++++++++++++ + +The following table presents CVXIF parameters supported by CVA6. + +[cols=",,",options="header",] +|============================================= +|Signal |Value |Description +|*X_NUM_RS* |int: 2 or 3 (configurable) a| +[verse] +-- +Number of register file read ports that can +be used by the eXtension interface +-- + + | +|*X_ID_WIDTH* |int: 3 a| +[verse] +-- +Identification width for the eXtension +interface +-- + + | +|*X_MEM_WIDTH* |n/a (feature not supported) a| +[verse] +-- +Memory access width for loads/stores via the +eXtension interface +-- + + | +|*X_RFR_WIDTH* |int: `XLEN` (32 or 64) a| +[verse] +-- +Register file read access width for the +eXtension interface +-- + + | +|*X_RFW_WIDTH* |int: `XLEN` (32 or 64) a| +[verse] +-- +Register file write access width for the +eXtension interface +-- + + | +|*X_MISA* |logic[31:0]: 0x0000_0000 a| +[verse] +-- +MISA extensions implemented on the eXtension +interface +-- + + | +|============================================= + +[[cv-x-if-enabling]] +CV-X-IF Enabling +++++++++++++++++ + +CV-X-IF can be enabled or disabled via the `CVA6ConfigCvxifEn` parameter in the SystemVerilog source code. 
+ +[[illegal-instruction-decoding]] +Illegal instruction decoding +++++++++++++++++++++++++++++ + +The CVA6 decoder module detects illegal instructions for the CVA6, prepares exception field +with relevant information (exception code "ILLEGAL INSTRUCTION", instruction value). + +The exception valid flag is raised in CVA6 decoder when CV-X-IF is disabled. Otherwise +it is not raised at this stage because the decision belongs to the coprocessor +after the offload process. + +[[rs3-support]] +RS3 support ++++++++++++ + +The number of source registers used by the CV-X-IF coprocessor is configurable with 2 or +3 source registers. + +If CV-X-IF is enabled and configured with 3 source registers, +a third read port is added to the CVA6 general purpose register file. + +[[description-of-interface-connections-between-cva6-and-coprocessor]] +Description of interface connections between CVA6 and Coprocessor ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++ + +In CVA6 execute stage, there is a new functional unit dedicated to drive the CV-X-IF interfaces. +Here is _how_ and _to what_ CV-X-IF interfaces are connected to the CVA6. + +* Issue interface:: + ** Request;; + *** [verse] + -- + Operands are connected to `issue_req.rs` signals + -- + *** [verse] + -- + Scoreboard transaction id is connected to `issue_req.id` signal. + Therefore scoreboard ids and offloaded instruction ids are linked + together (equal in this implementation). It allows the CVA6 to do out + of order execution with the coprocessor in the same way as other + functional units. + -- + *** [verse] + -- + Undecoded instruction is connected to `issue_req.instruction` + -- + *** [verse] + -- + Valid signal for CVXIF functional unit is connected to + `issue_req.valid` + -- + *** [verse] + -- + All `issue_req.rs_valid` signals are set to 1. The validity of source + registers is assured by the validity of valid signal sent from issue stage. 
+ -- + ** Response;; + *** [verse] + -- + If `issue_resp.accept` is set during a transaction (i.e. issue valid + and ready are set), the offloaded instruction is accepted by the coprocessor + and a result transaction will happen. + -- + *** [verse] + -- + If `issue_resp.accept` is not set during a transaction, the offloaded + instruction is illegal and an illegal instruction exception will be + raised as soon as no result transaction is written on the writeback bus. + -- +* Commit interface:: + ** [verse] + -- + Valid signal of commit interface is connected to the valid signal of + issue interface. + -- + ** [verse] + -- + Id signal of commit interface is connected to issue interface id signal + (i.e. scoreboard id). + -- + ** [verse] + -- + Killing of offload instruction is never set. (Unsupported feature) + -- + ** [verse] + -- + Therefore all accepted offloaded instructions are committed to their + execution and no killing of instruction is possible in this implementation. + -- +* Result interface:: + ** Request;; + *** [verse] + -- + Ready signal of result interface is always set as CVA6 is always ready + to take a result from coprocessor for an accepted offloaded instruction. + -- + ** Response;; + *** [verse] + -- + Result response is directly connected to writeback bus of the CV-X-IF + functional unit. + -- + *** [verse] + -- + Valid signal of result interface is connected to valid signal of + writeback bus. + -- + *** [verse] + -- + Id signal of result interface is connected to scoreboard id of + writeback bus. + -- + *** [verse] + -- + Write enable signal of result interface is connected to a dedicated CV-X-IF WE + signal in CVA6 which signals scoreboard if a writeback should happen + or not to the CVA6 register file. + -- + *** [verse] + -- + `exccode` and `exc` signal of result interface are connected to exception + signals of writeback bus. Exception from coprocessor does not write + the `tval` field in exception signal of writeback bus.
+ -- + *** [verse] + -- + Three registers are added to hold illegal instruction information in + case a result transaction and a non-accepted issue transaction happen + in the same cycle. Result transactions will be written to the writeback + bus in this case having priority over the non-accepted instruction due + to being linked to an older offloaded instruction. Once the writeback + bus is free, an illegal instruction exception will be raised thanks to + information held in these three registers. + -- + +[[coprocessor-recommendations-for-use-with-cva6s-cv-x-if]] +Coprocessor recommendations for use with CVA6's CV-X-IF +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + +CVA6 supports all coprocessors supporting the CV-X-IF specification with the exception of: + +* Coprocessor requiring the Memory interface and Memory result interface (not implemented in CVA6 yet).:: + ** All memory transactions should happen via the Issue interface, i.e. Load into CVA6 register file + then initialize an issue transaction. +* Coprocessor requiring the Compressed interface (not implemented in CVA6 yet).:: + ** RISC-V Compressed extension (RVC) is already implemented in CVA6. The user space for custom compressed instructions + is not big enough to have both RVC and a custom compressed extension. +* Stateful coprocessors.:: + ** CVA6 will commit on the Commit interface all its issue transactions. Speculation + information is only kept in the CVA6 and the speculation process is only done in CVA6. + The coprocessor shall be stateless otherwise it will not be able to revert its state if CVA6 kills an + in-flight instruction (in case of mispredict or flush). + +[[how-to-use-cva6-without-cv-x-if-interface]] +How to use CVA6 without CV-X-IF interface +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + +Select a configuration with `CVA6ConfigCvxifEn` parameter disabled or change it for your configuration. + +Never leave the CV-X-IF interface unconnected with the `CVA6ConfigCvxifEn` parameter enabled.
+ +[[how-to-design-a-coprocessor-for-the-cv-x-if-interface]] +How to design a coprocessor for the CV-X-IF interface +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + +_The team is looking for a contributor to write this section._ + +[[how-to-program-a-cv-x-if-coprocessor]] +How to program a CV-X-IF coprocessor +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + +_The team is looking for a contributor to write this section._ diff --git a/docs/design/design-manual/source/Traps_Interrupts_Exceptions.adoc b/docs/design/design-manual/source/Traps_Interrupts_Exceptions.adoc new file mode 100644 index 0000000000..078e02584e --- /dev/null +++ b/docs/design/design-manual/source/Traps_Interrupts_Exceptions.adoc @@ -0,0 +1,129 @@ +[[traps-interrupts-exceptions]] +Traps, Interrupts, Exceptions +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +Traps are composed of interrupts and exceptions. +Interrupts are asynchronous events whereas exceptions are synchronous ones. +On the one hand, interrupts occur independently of the instructions +(mainly raised by peripherals or debug module). +On the other hand, an instruction may raise exceptions synchronously. + +[[raising-traps]] +Raising Traps +^^^^^^^^^^^^^ + +When a trap is raised, the behaviour of the CVA6 core depends on +several CSRs and some CSRs are modified. + +[[configuration-csrs]] +Configuration CSRs +++++++++++++++++++ + +CSRs having an effect on the core behaviour when a trap occurs are: + +* `mstatus` and `sstatus`: several fields control the core behaviour like interrupt enable (`MIE`, `SIE`) +* `mtvec` and `stvec`: specifies the address of trap handler.
+* `medeleg`: specifies which exceptions can be handled by a lower privileged mode (S-mode) +* `mideleg`: specifies which interrupts can be handled by a lower privileged mode (S-mode) + +[[modified-csrs]] +Modified CSRs ++++++++++++++ + +CSRs (or fields) updated by the core when a trap occurs are: + +* `mstatus` or `sstatus`: several fields are updated like previous privilege mode (`MPP`, `SPP`), previous interrupt enabled (`MPIE`, `SPIE`) +* `mepc` or `sepc`: updated with the virtual address of the interrupted instruction or the instruction raising the exception. +* `mcause` or `scause`: updated with a code indicating the event causing the trap. +* `mtval` or `stval`: updated with exception specific information like the faulting virtual address + +[[supported-exceptions]] +Supported exceptions +++++++++++++++++++++ + +The following exceptions are supported by the CVA6: + +* instruction address misaligned +** control flow instruction with misaligned target + +* instruction access fault +** access to PMP region without execute permissions + +* illegal instruction: +** unimplemented CSRs +** unsupported extensions + +* breakpoint (`EBREAK`) + +* load address misaligned: +** `LH` at 2n+1 address +** `LW` at 4n+1, 4n+2, 4n+3 address + +* load access fault +** access to PMP region without read permissions + +* store/AMO address misaligned +** `SH` at 2n+1 address +** `SW` at 4n+1, 4n+2, 4n+3 address + +* store/AMO access fault +** access to PMP region without write permissions + +* environment call (`ECALL`) from U-mode + +* environment call (`ECALL`) from S-mode + +* environment call (`ECALL`) from M-mode + +* instruction page fault + +* load page fault +** access to effective address without read permissions + +* store/AMO page fault +** access to effective address without write permissions + +* debug request (custom) via debug interface + +Note: all exceptions are supported except the ones linked to the hypervisor extension + +[[trap-return]] +Trap return
+^^^^^^^^^^^ + +Trap handler ends with trap return instruction (`MRET`, `SRET`). The behaviour of the CVA6 core depends on several CSRs. + +[[configuration-csrs-1]] +Configuration CSRs +++++++++++++++++++ + +CSRs having an effect on the core behaviour when returning from a trap are: + +* `mstatus`: several fields control the core behaviour like previous privilege mode (`MPP`, `SPP`), previous interrupt enabled (`MPIE`, `SPIE`) + +[[modified-csrs-1]] +Modified CSRs ++++++++++++++ + +CSRs (or fields) updated by the core when returning from a trap are: + +* `mstatus`: several fields are updated like interrupt enable (`MIE`, `SIE`), modify privilege (`MPRV`) + +[[interrupts]] +Interrupts +^^^^^^^^^^ + +* external interrupt: `irq_i` signal +* software interrupt (inter-processor interrupt): `ipi_i` signal +* timer interrupt: `time_irq_i` signal +* debug interrupt: `debug_req_i` signal + +These signals are level sensitive. It means the interrupt is raised until it is cleared. + +The exception code field (`mcause` CSR) depends on the interrupt source. + +[[wait-for-interrupt]] +Wait for Interrupt +^^^^^^^^^^^^^^^^^^ + +* CVA6 implementation: `WFI` stalls the core. The instruction is not available in U-mode (raise illegal instruction exception). Such exception is also raised when `TW=1` in `mstatus`. diff --git a/docs/04_cv32a65x/design/source/architecture.rst b/docs/design/design-manual/source/architecture.adoc similarity index 57% rename from docs/04_cv32a65x/design/source/architecture.rst rename to docs/design/design-manual/source/architecture.adoc index 8ddd680a17..3c4399b29e 100644 --- a/docs/04_cv32a65x/design/source/architecture.rst +++ b/docs/design/design-manual/source/architecture.adoc @@ -1,4 +1,4 @@ -.. +//// Copyright 2022 Thales DIS design services SAS Licensed under the Solderpad Hardware Licence, Version 2.0 (the "License"); you may not use this file except in compliance with the License. 
@@ -6,42 +6,29 @@ You may obtain a copy of the License at https://solderpad.org/licenses/ Original Author: Jean-Roch COULON - Thales +//// - - +[[architecture-and-modules]] Architecture and Modules -======================== +------------------------ -The CV32A65X is fully synthesizable. It has been designed mainly for ASIC designs, but FPGA synthesis is supported as well. +The {ohg-config} is fully synthesizable. It has been designed mainly for ASIC designs, but FPGA synthesis is supported as well. For ASIC synthesis, the whole design is completely synchronous and uses positive-edge triggered flip-flops. The core occupies an area of about 80 kGE. The clock frequency can be more than 1GHz depending of technology. -The CV32A65X subsystem is composed of 8 modules. - -.. figure:: ../images/subsystems.png - :name: CV32A6 v0.1.0 modules - :align: center - :alt: +The {ohg-config} subsystem is composed of 8 modules. - CV32A65X modules +image:subsystems.png[{ohg-config} modules] Connections between modules are illustrated in the following block diagram. FRONTEND, DECODE, ISSUE, EXECUTE, COMMIT and CONTROLLER are part of the pipeline. And CACHES implements the instruction and data caches and CSRFILE contains registers. -.. figure:: ../images/CV32A65X_subsystems.png - :name: CV32A65X subsystem - :align: center - :alt: - - CV32A65X pipeline and modules - -.. 
toctree:: - :hidden: +image:{ohg-config}_subsystems.png[{ohg-config} pipeline and modules] - cv32a6_frontend - cva6_id_stage - cva6_issue_stage - cv32a6_execute - cva6_commit_stage - cva6_controller - cva6_csr_regfile - cva6_caches +include::cva6_frontend.adoc[] +include::cva6_id_stage.adoc[] +include::cva6_issue_stage.adoc[] +include::cva6_execute.adoc[] +include::cva6_commit_stage.adoc[] +include::cva6_controller.adoc[] +include::cva6_csr_regfile.adoc[] +include::cva6_caches.adoc[] diff --git a/docs/04_cv32a65x/design/source/cva6_caches.rst b/docs/design/design-manual/source/cva6_caches.adoc similarity index 68% rename from docs/04_cv32a65x/design/source/cva6_caches.rst rename to docs/design/design-manual/source/cva6_caches.adoc index 21e1d7fec8..1e9fcb2b9d 100644 --- a/docs/04_cv32a65x/design/source/cva6_caches.rst +++ b/docs/design/design-manual/source/cva6_caches.adoc @@ -1,4 +1,4 @@ -.. +//// Copyright 2024 Thales DIS France SAS Licensed under the Solderpad Hardware License, Version 2.1 (the "License"); you may not use this file except in compliance with the License. @@ -6,36 +6,33 @@ You may obtain a copy of the License at https://solderpad.org/licenses/ Original Author: Jean-Roch COULON - Thales +//// -.. _CVA6_CACHES: - +[[CVA6_CACHES]] CACHES Module -============= +~~~~~~~~~~~~~ +[[caches-description]] Description ------------ +^^^^^^^^^^^ -The CACHES module implements an instruction cache, a data cache and an AXI adapter. +The CACHES module implements an instruction cache, a data cache and an +AXI adapter. The module is connected to: * TO_BE_COMPLETED -.. include:: port_cva6_hpdcache_subsystem.rst - +include::port_cva6_hpdcache_subsystem.adoc[] +[[caches-functionality]] Functionality -------------- +^^^^^^^^^^^^^ TO BE COMPLETED - +[[caches-submodules]] Submodules ----------- - -.. 
figure:: ../images/caches.png - :name: CACHES submodules - :align: center - :alt: +^^^^^^^^^^ - CACHES submodules +image:caches.png[CACHES submodules] diff --git a/docs/design/design-manual/source/cva6_commit_stage.adoc b/docs/design/design-manual/source/cva6_commit_stage.adoc new file mode 100644 index 0000000000..99611f2ce9 --- /dev/null +++ b/docs/design/design-manual/source/cva6_commit_stage.adoc @@ -0,0 +1,42 @@ +//// + Copyright 2024 Thales DIS France SAS + Licensed under the Solderpad Hardware License, Version 2.1 (the "License"); + you may not use this file except in compliance with the License. + SPDX-License-Identifier: Apache-2.0 WITH SHL-2.1 + You may obtain a copy of the License at https://solderpad.org/licenses/ + + Original Author: Jean-Roch COULON - Thales +//// + +[[CVA6_COMMIT_STAGE]] +COMMIT_STAGE Module +~~~~~~~~~~~~~~~~~~~ + +[[commit_stage-description]] +Description +^^^^^^^^^^^ + +The COMMIT_STAGE module implements the commit stage, which is the last +stage in the processor’s pipeline. For the instructions for which the +execution is completed, it updates the architectural state: writing CSR +registers, committing stores and writing back data to the register file. +The commit stage controls the stalling and the flushing of the +processor. + +The commit stage also manages the exceptions. An exception can occur +during the first four pipeline stages (PCgen cannot generate an +exception) or happen in commit stage, coming from the CSR_REGFILE or +from an interrupt. Exceptions are precise: they are considered during +the commit only and associated with the related instruction. 
+ +The module is connected to: + +* TO BE COMPLETED + +include::port_commit_stage.adoc[] + +[[commit_stage-functionality]] +Functionality +^^^^^^^^^^^^^ + +TO BE COMPLETED diff --git a/docs/04_cv32a65x/design/source/cva6_controller.rst b/docs/design/design-manual/source/cva6_controller.adoc similarity index 76% rename from docs/04_cv32a65x/design/source/cva6_controller.rst rename to docs/design/design-manual/source/cva6_controller.adoc index 468970856e..6be2f70285 100644 --- a/docs/04_cv32a65x/design/source/cva6_controller.rst +++ b/docs/design/design-manual/source/cva6_controller.adoc @@ -1,4 +1,4 @@ -.. +//// Copyright 2024 Thales DIS France SAS Licensed under the Solderpad Hardware License, Version 2.1 (the "License"); you may not use this file except in compliance with the License. @@ -6,14 +6,15 @@ You may obtain a copy of the License at https://solderpad.org/licenses/ Original Author: Jean-Roch COULON - Thales +//// -.. _CVA6_CONTROLLER: - +[[CVA6_CONTROLLER]] CONTROLLER Module -================= +~~~~~~~~~~~~~~~~~ +[[controller-description]] Description ------------ +^^^^^^^^^^^ The CONTROLLER module implements ... TO BE COMPLETED @@ -21,10 +22,10 @@ The module is connected to: * TO BE COMPLETED -.. include:: port_controller.rst +include::port_controller.adoc[] +[[controller-functionality]] Functionality -------------- +^^^^^^^^^^^^^ TO BE COMPLETED - diff --git a/docs/04_cv32a65x/design/source/cva6_csr_regfile.rst b/docs/design/design-manual/source/cva6_csr_regfile.adoc similarity index 76% rename from docs/04_cv32a65x/design/source/cva6_csr_regfile.rst rename to docs/design/design-manual/source/cva6_csr_regfile.adoc index bdc6b35487..a79bd9bf93 100644 --- a/docs/04_cv32a65x/design/source/cva6_csr_regfile.rst +++ b/docs/design/design-manual/source/cva6_csr_regfile.adoc @@ -1,4 +1,4 @@ -.. 
+//// Copyright 2024 Thales DIS France SAS Licensed under the Solderpad Hardware License, Version 2.1 (the "License"); you may not use this file except in compliance with the License. @@ -6,14 +6,15 @@ You may obtain a copy of the License at https://solderpad.org/licenses/ Original Author: Jean-Roch COULON - Thales +//// -.. _CVA6_CSR_REGFILE: - +[[CVA6_CSR_REGFILE]] CSR_REGFILE Module -================== +~~~~~~~~~~~~~~~~~~ +[[csr_regfile-description]] Description ------------ +^^^^^^^^^^^ The CSR_REGFILE module implements ... TO BE COMPLETED @@ -21,9 +22,10 @@ The module is connected to: * TO BE COMPLETED -.. include:: port_csr_regfile.rst +include::port_csr_regfile.adoc[] +[[csr_regfile-functionality]] Functionality -------------- +^^^^^^^^^^^^^ TO BE COMPLETED diff --git a/docs/04_cv32a65x/design/source/cv32a6_execute.rst b/docs/design/design-manual/source/cva6_execute.adoc similarity index 57% rename from docs/04_cv32a65x/design/source/cv32a6_execute.rst rename to docs/design/design-manual/source/cva6_execute.adoc index c411d44eda..55f98719cf 100644 --- a/docs/04_cv32a65x/design/source/cv32a6_execute.rst +++ b/docs/design/design-manual/source/cva6_execute.adoc @@ -1,12 +1,11 @@ -.. _CVA6_EX_STAGE: +[[CVA6_EX_STAGE]] -############### EX_STAGE Module -############### +~~~~~~~~~~~~~~~ -*********** +[[ex_stage-description]] Description -*********** +^^^^^^^^^^^ The EX_STAGE module is a logical stage which implements the execute stage. It encapsulates the following functional units: ALU, Branch Unit, CSR buffer, Mult, load and store and CVXIF. @@ -16,89 +15,73 @@ The module is connected to: * ID_STAGE module provides scoreboard entry. * -.. include:: port_ex_stage.rst - -************* +[[ex_stage-functionality]] Functionality -************* +^^^^^^^^^^^^^ TO BE COMPLETED -********** +[[ex_stage-submodules]] Submodules -********** - -.. 
figure:: ../images/ex_stage_modules.png - :name: EX_STAGE submodules - :align: center - :alt: - - EX_STAGE submodules +^^^^^^^^^^ +image:ex_stage_modules.png[EX_STAGE submodules] +[[alu]] alu -=== ++++ The arithmetic logic unit (ALU) is a small piece of hardware which performs 32 and 64-bit arithmetic and bitwise operations: subtraction, addition, shifts, comparisons... It always completes its operation in a single cycle. -.. include:: port_alu.rst - +include::port_alu.adoc[] +[[branch_unit]] branch_unit -=========== ++++++++++++ The branch unit module manages all kinds of control flow changes i.e.: conditional and unconditional jumps. It calculates the target address and decides whether to take the branch or not. It also decides if a branch was mis-predicted or not and reports corrective actions to the pipeline stages. -.. include:: port_branch_unit.rst - +include::port_branch_unit.adoc[] +[[csr_buffer]] CSR_buffer -========== +++++++++++ The CSR buffer module stores the CSR address at which the instruction is going to read/write. As the CSR instruction alters the processor architectural state, this instruction has to be buffered until the commit stage decides to execute the instruction. -.. include:: port_csr_buffer.rst - +include::port_csr_buffer.adoc[] +[[mult]] mult -==== +++++ The multiplier module supports the division and multiplication operations. -.. figure:: ../images/mult_modules.png - :name: mult submodules - :align: center - :alt: - - mult submodules - -.. include:: port_mult.rst +image:mult_modules.png[mult submodules] +include::port_mult.adoc[] ----------- -multiplier ----------- +[[multiplier]] +====== multiplier Multiplication is performed in two cycles and is fully pipelined. -.. include:: port_multiplier.rst +include::port_multiplier.adoc[] - ------- -serdiv ------- +[[serdiv]] +====== serdiv The division is a simple serial divider which needs 64 cycles in the worst case. -.. 
include:: port_serdiv.rst - +include::port_serdiv.adoc[] +[[load_store_unit-lsu]] load_store_unit (LSU) -===================== ++++++++++++++++++++++ The load store module interfaces with the data cache (D$) to manage the load and store operations. @@ -106,19 +89,12 @@ The LSU does not handle misaligned accesses. Misaligned accesses are double word accesses which are not aligned to a 64-bit boundary, word accesses which are not aligned to a 32-bit boundary and half word accesses which are not aligned on 16-bit boundary. If the LSU encounters a misaligned load or store, it throws a misaligned exception. -.. figure:: ../images/load_store_unit_modules.png - :name: load_store_unit submodules - :align: center - :alt: - - load_store_unit submodules - -.. include:: port_load_store_unit.rst +image:load_store_unit_modules.png[load_store_unit submodules] +include::port_load_store_unit.adoc[] ----------- -store_unit ----------- +[[store_unit]] +====== store_unit The store_unit module manages the data store operations. @@ -132,40 +108,43 @@ When commit buffer is not empty, the buffer automatically tries to write the old Furthermore, the store_unit module provides information to the load_unit to know if an outstanding store matches addresses with a load. -.. include:: port_store_unit.rst +include::port_store_unit.adoc[] +[[load_unit]] +====== load_unit ---------- -load_unit ---------- - -The load_unit module manages the data load operations. +The load unit module manages the data load operations. Before issuing a load, the load unit needs to check the store buffer for potential aliasing. -It inserts stalls until it can satisfy the current request. This means: +It stalls until it can satisfy the current request. This means: * Two loads to the same address are allowed. * Two stores to the same address are allowed. -* A store followed by a load to the same address can only be satisfied if the store has already been committed (marked as committed in the store buffer). - -.. 
TO_BE_COMPLETED, But once the store is committed, do we do forwarding without waiting for the store to actually be finished? Or do we authorize the outcome of the load, which will be carried out in memory/cache? +* A store after a load to the same address is allowed. +* A load after a store to the same address can only be processed if the store has already been sent to the cache i.e. there is no forwarding. -.. include:: port_load_unit.rst +After the check of the store buffer, a read request is sent to the D$ with the index field of the address (1). +The load unit stalls until the D$ acknowledges this request (2). +In the next cycle, the tag field of the address is sent to the D$ (3). +If the load request address is non-idempotent, it stalls until the write buffer of the D$ is empty of non-idempotent requests and the store buffer is empty. +It also stalls until the incoming load instruction is the next instruction to be committed. +When the D$ allows the read of the data, the data is sent to the load unit and the load instruction can be committed (4). +image:schema_fsm_load_control.png[Load unit's interactions] ----------- -lsu_bypass ----------- +include::port_load_unit.adoc[] -TO BE COMPLETED +[[lsu_bypass]] +====== lsu_bypass -.. include:: port_lsu_bypass.rst +The LSU bypass is a FIFO which keeps instructions from the issue stage when the store unit or the load unit are not available immediately. +include::port_lsu_bypass.adoc[] +[[cvxif_fu]] CVXIF_fu -======== +++++++++ TO BE COMPLETED -.. 
include:: port_cvxif_fu.rst - +include::port_cvxif_fu.adoc[] diff --git a/docs/design/design-manual/source/cva6_frontend.adoc b/docs/design/design-manual/source/cva6_frontend.adoc new file mode 100644 index 0000000000..dc6cb6bb36 --- /dev/null +++ b/docs/design/design-manual/source/cva6_frontend.adoc @@ -0,0 +1,373 @@ +//// + Copyright 2021 Thales DIS design services SAS + Licensed under the Solderpad Hardware Licence, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + SPDX-License-Identifier: Apache-2.0 WITH SHL-2.0 + You may obtain a copy of the License at https://solderpad.org/licenses/ + + Original Author: Jean-Roch COULON - Thales +//// + +[[CVA6_FRONTEND]] + +FRONTEND Module +~~~~~~~~~~~~~~~ + +[[frontend-description]] +Description +^^^^^^^^^^^ + +The FRONTEND module implements the first two stages of the cva6 pipeline, +PC gen and Fetch stages. + +PC gen stage is responsible for generating the next program counter. +It hosts a Branch Target Buffer (BTB), a Branch History Table (BHT) and a Return Address Stack (RAS) to speculate on control flow instructions. + +Fetch stage requests data to the CACHE module, realigns the data to store them in instruction queue and transmits the instructions to the DECODE module. +FRONTEND can fetch up to {instr-per-fetch} instructions per cycle, but DECODE module decodes up to {issue-width} instruction(s) per cycle. + +The module is connected to: + +* CACHES module provides fetched instructions to FRONTEND. +* DECODE module receives instructions from FRONTEND. +* CONTROLLER module can order to flush and to halt FRONTEND PC gen stage +* EXECUTE, CONTROLLER, CSR and COMMIT modules trigger PC jumping due to +a branch misprediction, an exception, a return from an exception, a +debug entry or a pipeline flush. They provide the next PC value. +* CSR module states about debug mode. 
+ +include::port_frontend.adoc[] + +[[frontend-functionality]] +Functionality +^^^^^^^^^^^^^ + +[[pc-generation-stage]] +PC Generation stage +^^^^^^^^^^^^^^^^^^^ + +PC gen generates the next program counter. The next PC can originate from the following sources (listed in order of precedence): + +* *Reset state:* At reset, the PC is assigned to the boot address. + +* *Branch Prediction:* The fetched instruction is predecoded by the +instr_scan submodule. When the instruction is a control flow, three +cases are considered: ++ +________________________________________________________________________________________________________________________________________________________________________________________________________________________________________________________________________________________ +1. When the instruction is a JALR which corresponds to a return (rs1 = x1 or rs1 = x5). +RAS provides next PC as a prediction. +2. When the instruction is a JALR which *does not* correspond to areturn. +If BTB (Branch Target Buffer) returns a valid address, then BTBpredicts next PC. +Else JALR is not considered as a control flow instruction, which will generate a mispredict. +3. When the instruction is a conditional branch. +If BHT (Branch History table) returns a valid address, then BHT predicts next PC. +Else the prediction depends on the PC relative jump offset sign: if sign is negative the prediction is taken, otherwise the prediction is not taken. +________________________________________________________________________________________________________________________________________________________________________________________________________________________________________________________________________________________ ++ +Then the PC gen informs the Fetch stage that it performed a prediction on the PC. + +* *Default:* The next {ifetch-len}-bit block is fetched. +PC Gen fetches word boundary {ifetch-len}-bits block from CACHES module. 
+And the fetch stage identifies the instructions from the {ifetch-len}-bits blocks. + +* *Mispredict:* Mispredictions are fed back by the EX_STAGE module. +In any case we need to correct our action and start fetching from the correct address. + +* *Replay instruction fetch:* When the instruction queue is full, the instr_queue submodule asks the fetch replay and provides the address to be replayed. + +* *Return from environment call:* When CSR requests a return from an environment call, next PC takes the value of the PC of the instruction after the one pointed to by the mepc CSR. + +* *Exception/Interrupt:* If an exception is triggered by CSR_REGISTER, next PC takes the value of the trap vector base address CSR. +ifdef::RVS-true,RVU-true[] +The trap vector base address can be different depending on whether the exception traps to S-Mode or M-Mode. +It is the purpose of the CSR Unit to figure out where to trap to and present the correct address to PC Gen. +endif::[] + +* *Pipeline starting fetching from COMMIT PC:* When the commit stage is halted by a WFI instruction or when the pipeline has been flushed due to CSR change, next PC takes the value of the PC coming from the COMMIT submodule. +As CSR instructions do not exist in a compressed form, PC is unconditionally incremented by 4. + +ifeval::[{DebugEn} == true] +* *Debug:* Debug has the highest order of precedence as it can interrupt any control flow requests. It is also the only source of control flow change which can actually happen simultaneously to any other of the forced control flow changes. +The debug jump is requested by CSR. +The address to be jumped into is HW coded. +endif::[] + +All program counters are logically addressed. +ifeval::[{MmuPresent} == true] +If the logical to physical mapping changes, a ``fence.vm`` instruction should be used to flush the pipeline and TLBs. +endif::[] + +[[fetch-stage]] +Fetch Stage +^^^^^^^^^^^ + +Fetch stage controls the CACHE module by a handshaking protocol. 
+Fetched data is a {ifetch-len}-bit block with a word-aligned address. +A granted fetch is processed by the instr_realign submodule to produce instructions. +Then instructions are pushed into an internal instruction FIFO called instruction queue (instr_queue submodule). +This submodule stores the instructions with its associated address and sends them to the DECODE module. + +Before sending the instructions to the DECODE stage, the frontend calculates a prediction address in case of a JUMP or Branch. This predicted address is sent to the DECODE stage along with the instruction and its fetch address. +The prediction address is not valid if there is no prediction. +Instructions following a predicted taken control flow instruction are dropped. + +// TO_BE_COMPLETED MMU also feedback an exception, but not present in 65X + +Memory can feedback potential exceptions which can be bus errors, invalid accesses or instruction page faults. +The FRONTEND transmits the exception from CACHES to DECODE. + + + +[[submodules]] +Submodules +^^^^^^^^^^ + +image:frontend_modules.png[FRONTEND submodules] + +image:ZoominFrontend.png[FRONTEND submodule interconnections] + +[[instr_realign-submodule]] +Instr_realign submodule ++++++++++++++++++++++++ + +The {ifetch-len}-bit aligned block coming from the CACHE module enters the instr_realign submodule. +This submodule extracts the instructions from the {ifetch-len}-bit blocks. +Based on the fetch address and the fetched data, the instr_realign module extracts the valid instructions to be sent to the queue. +It is possible to fetch up to {instr-per-fetch} instructions per cycle when C extension is used. +A not-compressed instruction can be misaligned on the block size, interleaved with two cache blocks. +In that case, two cache accesses are needed to get the whole instruction. +The instr_realign submodule provides up to {instr-per-fetch} instructions per cycle when compressed extension is enabled, else one instruction per cycle. 
+Incomplete instruction is stored in instr_realign submodule until its second half is fetched. + +Below is a table that explains how the instr_realign works: + +* _C : compressed instruction_ +* _I : not compressed instruction_ +* _UI: Incomplete instruction stored in the instr_realign_ + +ifeval::[{Superscalar} == true] + +|========================================================== +|*Address[2:1]* |*Incomplete Access* |*64-48* |*48-32* |*32-16* |*16-0* +.13+^.^|0 + +.5+^.^|1 +|UI +2.+^.^|I +|UI +|C +2.+^.^|I +|UI +2.+^.^|I +|C |UI +|C |C |C |UI +|UI |C |C |UI + +.8+^.^|0 +|UI +2.+^.^|I +|C +|C +2.+^.^|I +|C +2.+^.^|I +|C +|C +|UI |C |C |C +|C |C |C |C +|UI |C +2.+^.^|I +|C |C +2.+^.^|I +2.+^.^|I +2.+^.^|I + +.5+^.^|1 +.5+^.^|__ +|* |UI +2.+^.^|I +|* |C +2.+^.^|I +|* +2.+^.^|I +|C +|* |UI |C |C +|* |C |C |C + +.3+^.^|2 +.3+^.^|__ +2.+^.^|* +|UI |C +2.+^.^|* +|C |C +2.+^.^|* +2.+^.^|I + +.2+^.^|3 +.2+^.^|__ +3.+^.^|* +|UI +3.+^.^|* +|C + +|========================================================== + +endif::[] +ifeval::[{Superscalar} == false] + +|========================================================== +|*Address[2:1]* |*Incomplete Access* |*32-16* |*16-0* +.5+^.^|0 + +.3+^.^|0 +2.+^.^|I +|C +|C +|UI +|C + +.2+^.^|1 +|UI +|UI +|C +|UI + +.2+^.^|1 +.2+^.^|__ +|* |C +|* |UI + +|========================================================== + +endif::[] + +The Instr_realign can be flushed when the frontend requests the cache to kill the incoming instruction, in this case the incomplete instruction is deleted. + +include::port_instr_realign.adoc[] + +[[instr_queue-submodule]] +Instr_queue submodule ++++++++++++++++++++++ + +The instr_queue receives mutliple instructions from instr_realign submodule to create a valid stream to be executed. 
+Frontend pushes instructions and all related information into the FIFO for storage, including details needed in case of a misprediction or exception: the instructions themselves, instruction control flow type, exception, exception address, and predicted address. +DECODE pops them when decode stage is ready and indicates to the FRONTEND the instruction has been consumed. + + +The instruction queue contains two FIFOs: one for instructions and one for addresses, which stores addresses in case of a prediction. +The instruction FIFO can hold up to 4×{INSTR_PER_FETCH} instructions, while the address FIFO can hold up to 2 addresses. +If the instruction FIFO is full, a replay request is sent to inform the fetch mechanism to replay the fetch. +If the address FIFO is full and there is a prediction, a replay request is sent to inform the fetch mechanism to replay the fetch, even if the instruction FIFO is not full. + +The instruction queue can be flushed by the flush signal coming from the CONTROLLER. + + +include::port_instr_queue.adoc[] + +[[instr_scan-submodule]] +Instr_scan submodule +++++++++++++++++++++ + +As compressed extension is enabled, {instr-per-fetch} instr_scan are instantiated to handle up to {instr-per-fetch} instructions per cycle. + +Each instr_scan submodule pre-decodes the fetched instructions coming from the instr_realign module and also calculates the immediate; instructions may be compressed or not. +The instr_scan submodule is a flow controller which provides the instruction type: branch, jump, return, jalr, imm, call or others. +These outputs are used by the branch prediction feature. + +include::port_instr_scan.adoc[] + +ifeval::[{BHTEntries} > 0] +[[bht-branch-history-table-submodule]] +BHT (Branch History Table) submodule +++++++++++++++++++++++++++++++++++++ + +The BHT is implemented as a memory which is composed of {BHTEntries} entries. 
+The BHT is a two-dimensional table: + +* The first dimension represents the access address, with a length equal to `{BHTEntries} / {INSTR_PER_FETCH}`. +* The second dimension represents the row index, with a length equal to `INSTR_PER_FETCH`. + +In the case of branch prediction, the BHT uses only part of the virtual address to get the value of the saturation counter. +In the case of a valid misprediction, the BHT uses only part of the misprediction address to access the BHT table and update the saturation counter. + +'UPPER_ADDRESS_INDEX = $clog2(BHTDepth) + ((RVC == 1) ? 1 : 2)' + +'LOWER_ADDRESS_INDEX = (RVC == 1) ? 1 : 2 + $clog2(INSTR_PER_FETCH)' + +`ACCESS_ADDRESS = PC/MISPREDICT_ADDRESS [ UPPER_ADDRESS_INDEX : LOWER_ADDRESS_INDEX ]` + +The lower address bits of the virtual address point to the memory entry. + +'UPPER_ADDRESS_INDEX = (RVC == 1) ? 1 : 2 + $clog2(INSTR_PER_FETCH)' + +'LOWER_ADDRESS_INDEX = (RVC == 1) ? 1 : 2 + $clog2(INSTR_PER_FETCH)' + +`ACCESS_INDEX = PC/MISPREDICT_ADDRESS [ UPPER_ADDRESS_INDEX : LOWER_ADDRESS_INDEX]` + +_Two distinct branches with different addresses can share the same BHT entry if they have the same ACCESS_ADDRESS._ + +Each BHT entry contains a two-bit saturating counter and a valid bit. +On reset, the counters are set to 0 (strongly not taken) and the valid bits are cleared. +When a branch instruction is resolved by the EX_STAGE module, the valid bit is set and the counter is updated. +The two bit counter is updated by the successive execution of the instructions as shown in the following figure. + +image:bht.png[BHT saturation] + +When a branch instruction is pre-decoded by instr_scan submodule, the BHT checks whether the PC address is inside the BHT and provides the taken or not prediction. +The prediction is the most significant bit from the counter, where 1 means "taken". 
+ +When the Execute stage processes a branch instruction, it sends the branch status (whether it's taken or not) to the Frontend to update the BHT table. + +ifeval::[{DebugEn} == true] +FIXME The BHT is not updated if processor is in debug mode. +endif::[] + + +The BHT is never flushed. + +include::port_bht.adoc[] +endif::[] + +[[btb-branch-target-buffer-submodule]] +BTB (Branch Target Buffer) submodule +++++++++++++++++++++++++++++++++++++ + +ifeval::[{BTBEntries} > 0] +BTB is implemented as an array which is composed of {BTBEntries} entries. +The lower address bits of the virtual address point to the memory entry. + +When a JALR instruction is found mispredicted by the EX_STAGE module, the JALR PC and the target address are stored into the BTB. + +// TODO: Specify the behaviour when BTB is saturated + +// TODO: when debug enabled, The BTB is not updated if processor is in debug mode. + +When a JALR instruction is pre-decoded by instr_scan submodule, the BTB informs whether the input PC address is in the BTB. +In this case, the BTB provides the predicted target address. + +The BTB is never flushed. + +include::port_btb.adoc[] +endif::[] +ifeval::[{BTBEntries} == 0] +There is no BTB in {ohg-config}. +As a consequence, no valid address is returned from BTB. +endif::[] + +ifeval::[{RASDepth} > 0] +[[ras-return-address-stack-submodule]] +RAS (Return Address Stack) submodule +++++++++++++++++++++++++++++++++++++ + +RAS is implemented as a LIFO which is composed of {RASDepth} entries. + +When a "call" JAL instruction (rd = x1 or x5) is added to the instruction queue, the PC of the instruction following the JAL instruction is pushed into the stack. + +When a "ret" JALR instruction (rs1 = x1 or x5, and rd != rs1) is added to the instruction queue, the predicted return address is popped from the stack. +If the predicted return address is wrong due for instance to speculation or RAS depth limitation, a mispredict will be generated. + +The RAS is never flushed. 
+ +include::port_ras.adoc[] +endif::[] diff --git a/docs/design/design-manual/source/cva6_id_stage.adoc b/docs/design/design-manual/source/cva6_id_stage.adoc new file mode 100644 index 0000000000..3d2a1d2e1b --- /dev/null +++ b/docs/design/design-manual/source/cva6_id_stage.adoc @@ -0,0 +1,88 @@ +//// + Copyright 2024 Thales DIS France SAS + Licensed under the Solderpad Hardware License, Version 2.1 (the "License"); + you may not use this file except in compliance with the License. + SPDX-License-Identifier: Apache-2.0 WITH SHL-2.1 + You may obtain a copy of the License at https://solderpad.org/licenses/ + + Original Author: Jean-Roch COULON - Thales +//// + +[[CVA6_ID_STAGE]] +ID_STAGE Module +~~~~~~~~~~~~~~~ + +[[id_stage-description]] +Description +^^^^^^^^^^^ + +The ID_STAGE module implements the decode stage of the pipeline. Its +main purpose is to decode RISC-V instructions coming from FRONTEND +module (fetch stage) and send them to the ISSUE_STAGE module (issue +stage). + +The compressed_decoder module checks whether the incoming instruction is +compressed and output the corresponding uncompressed instruction. Then +the decoder module decodes the instruction and send it to the issue +stage. + +The module is connected to: + +* CONTROLLER module can request to flush the buffer at the end of ID_STAGE +* FRONTEND module sends instructions to ID_STAGE module +* ISSUE module receives the decoded instructions from ID_STAGE module +* CSR_REGFILE module sends status information about privilege mode, +traps, extension support. + +include::port_id_stage.adoc[] + +[[id_stage-functionality]] +Functionality +^^^^^^^^^^^^^ + +ID_STAGE transforms each instruction into a scoreboard entry whose fields indicate what the instruction does. +It receives external interrupts and, according to the interrupts configuration from CSR_REGFILE, it inserts exceptions in the pipeline. 
+ifeval::[{SuperscalarEn} == true] +It outputs scoreboard entries via a FIFO of depth 2, which can push and pull up to 2 instructions each cycle. +endif::[] + +[[id_stage-submodules]] +Submodules +^^^^^^^^^^ + +image:id_stage_modules.png[ID_STAGE submodules] + +[[compressed_decoder]] +Compressed_decoder +++++++++++++++++++ + +The compressed_decoder module decompresses all the compressed +instructions taking a 16-bit compressed instruction and expanding it to +its 32-bit equivalent. All compressed instructions have a 32-bit +equivalent. + +Non-compressed instructions on the input are transmitted as-is. + +include::port_compressed_decoder.adoc[] + +[[decoder]] +Decoder ++++++++ + +The decoder module takes the output of compressed_decoder module and +decodes it. It transforms the instruction to the most fundamental +control structure in pipeline, a scoreboard entry. + +The scoreboard entry contains an exception entry which is composed of a +valid field, a cause and a value called TVAL. As TVALEn configuration +parameter is zero, the TVAL field is not implemented. + +A potential illegal instruction exception can be detected during +decoding. If no exception has happened previously in fetch stage, the +decoder will validate the exception and add the cause and tval value to the +scoreboard entry. + +A potential interrupt can be sent to the decoder. +If no exception has happened previously in fetch stage, the exception is inserted into the scoreboard entry. + +include::port_decoder.adoc[] diff --git a/docs/design/design-manual/source/cva6_issue_stage.adoc b/docs/design/design-manual/source/cva6_issue_stage.adoc new file mode 100644 index 0000000000..ab8a1c08e8 --- /dev/null +++ b/docs/design/design-manual/source/cva6_issue_stage.adoc @@ -0,0 +1,247 @@ +//// + Copyright 2024 Thales DIS France SAS + Licensed under the Solderpad Hardware License, Version 2.1 (the "License"); + you may not use this file except in compliance with the License. 
+ SPDX-License-Identifier: Apache-2.0 WITH SHL-2.1 + You may obtain a copy of the License at https://solderpad.org/licenses/ + + Original Author: Jean-Roch COULON - Thales +//// + +[[CVA6_ISSUE_STAGE]] +ISSUE_STAGE Module +~~~~~~~~~~~~~~~~~~ + +[[issue_stage-description]] +Description +^^^^^^^^^^^ + +ISSUE_STAGE issues instructions (1), reorders their results (2) and sends completed instructions in-order to COMMIT_STAGE (3). + +(1) ISSUE_STAGE issues instructions in-order. +It makes sure that instructions from ID_STAGE have everything they need to run. +It waits until all requirements are met. +Once an instruction is ready to run, ISSUE_STAGE sends it to EX_STAGE with its operands. + +(2) ISSUE_STAGE reorders instructions results. +It gets results of instruction executions out-of-order from EX_STAGE. +ISSUE_STAGE stores these results reordered. + +(3) ISSUE_STAGE sends completed instructions in-order to COMMIT_STAGE. +This is where architectural state is modified. + +Scoreboard module keeps track of instructions and their results. +Issue_read_operands module contains all the issue logic and the register file. + +The module is connected to: + +* CONTROLLER module can request to flush the pipeline buffer at the end of ISSUE_STAGE. + CONTROLLER module can also request to flush the whole Scoreboard. +* ID_STAGE module delivers decoded instructions to ISSUE_STAGE. +* EX_STAGE module gets instructions issued by ISSUE_STAGE to execute them. + EX_STAGE module also returns results to ISSUE_STAGE. +* COMMIT_STAGE module delivers ISSUE_STAGE clearance to remove the oldest instruction from Scoreboard. + +include::port_issue_stage.adoc[] + +[[issue_stage-functionality]] +Functionality +^^^^^^^^^^^^^ + +ISSUE_STAGE has three functionalities. + +(1) ISSUE_STAGE issues instructions. +Instructions from ID_STAGE are sent to Scoreboard module, which forwards them to Issue_read_operands module. 
+Issue_read_operands queries Scoreboard module for data dependences (Scoreboard is also able to return forwarded values) and gets the list of busy functional units from EX_STAGE. +Issue_read_operands sends to EX_STAGE the instructions to execute and acknowledges to Scoreboard so that it stores them: the instruction is issued. +Issued instructions are acknowledged to ID_STAGE. +Each of these steps can block its successors. +The flush signal from CONTROLLER module is also sent to Scoreboard module to prevent from issuing. +Instructions are issued in-order: an instruction cannot be issued unless all its predecessors are issued. + +(2) ISSUE_STAGE reorders instructions results. +Results from EX_STAGE are sent to Scoreboard module so that they are stored. + +(3) ISSUE_STAGE sends completed instructions in-order to COMMIT_STAGE. +The oldest instructions from Scoreboard are exposed to COMMIT_STAGE. +When COMMIT_STAGE acknowledges a commit, the committed instruction is removed from Scoreboard and the register file in Issue_read_operands is updated with the instruction result. + +[[issue_stage-submodules]] +Submodules +^^^^^^^^^^ + +image:issue_stage_modules.png[ISSUE_STAGE submodules] + +[[scoreboard]] +Scoreboard +++++++++++ + +Scoreboard contains a FIFO which contains an entry for each issued instruction. +Each entry is removed once the instruction is committed. +Instruction results are inserted into Scoreboard when they are ready. +The FIFO is flushed when requested by CONTROLLER. + +Scoreboard is used in all three functionalities of ISSUE_STAGE. + +(1) ISSUE_STAGE issues instructions. +Up to {issue-width} instruction(s) can be received from ID_STAGE each cycle. +They are transmitted to Issue_read_operands with incremental transaction IDs which wrap at {NrScoreboardEntries}. +The result buses and Scoreboard entries are also transmitted to Issue_read_operands for it to detect data dependences and perform operand forwarding. 
+When Issue_read_operands acknowledges an instruction, it is inserted into the FIFO. + +Scoreboard has a capacity of {NrScoreboardEntries} entries. +Instructions which would make Scoreboard overflow are not transmitted to Issue_read_operands (ISSUE_STAGE stalls). + +The flush signal from CONTROLLER module removes all entries from the Scoreboard and prevents from issuing. +The transaction ID of the next issued instruction is 0. + +(2) ISSUE_STAGE reorders instructions results. +Results are returned from functional units in the EX_STAGE to Scoreboard via result buses, with their transaction IDs. +Scoreboard stores this result into the entry associated with this transaction ID. +If an exception is returned, it is stored too. +ifeval::[{SpeculativeSb} == true] +If the result is a branch miss, the speculative instructions following it are cancelled. +endif::[] + +ifeval::[{RVZCMP} == true] +FIXME Document behavior related to macro instructions +endif::[] + +ifeval::[{DebugEn} == true] +FIXME Document behavior related to debug +endif::[] + +ifeval::[{CvxifEn} == true] +FIXME Document behavior related to CV-X-IF +endif::[] + +ifeval::[{FpPresent} == true] +FIXME Document behavior related to FPU +endif::[] + +(3) ISSUE_STAGE sends completed instructions in-order to COMMIT_STAGE. +Each of the {commit-width} oldest entry(ies) in Scoreboard are exposed to COMMIT_STAGE, one per commit port. +This makes commit happen in-order. +When COMMIT_STAGE acknowledges on a commit port, the entry is removed from Scoreboard. + +include::port_scoreboard.adoc[] + +[[issue_read_operands]] +Issue_read_operands ++++++++++++++++++++ + +Issue_read_operands tracks hazards and gets the input operands for the instructions to execute. +The following hazards can prevent instructions from being issued. + +* Data hazards: ISSUE_STAGE checks that the instruction operands are available. 
+ +** [[raw_hazard]] Read-After-Write (RAW): if one of the source registers of the instruction to issue is the destination register of one of the instructions in the Scoreboard, issue is blocked. +However, CVA6 implements operand forwarding: instead of blocking the instruction, the operand is taken from either + +a) a functional unit which returns a result which is not an exception, with a transaction ID which points to a Scoreboard entry whose destination register is the requested source register; + +b) a Scoreboard entry whose destination register is the requested source register and whose result is not an exception. + +Forwarding is not possible from CSR instructions. +ifeval::[{RVS} == true] +FIXME Document behavior related to SFENCE_VMA +endif::[] ++ +ifeval::[{SuperscalarEn} == true] +The instruction in the second issue port is not issued when one of its source registers is the destination register of the instruction in the first issue port. +No operand forwarding is possible between the two issue ports. +endif::[] + +** Write-After-Write (WAW): if the instruction to issue has the same destination register as one of the instructions in the Scoreboard, issue is blocked. +Instructions being committed are ignored because they will no longer be in the scoreboard from the next cycle. ++ +ifeval::[{SuperscalarEn} == true] +The instruction in the second issue port is not issued when it has the same destination register as the instruction in the first issue port. +endif::[] + +** Special case: there are no data hazards on `x0`. + +ifeval::[{SpeculativeSb} == true] +** Cancelled instructions are ignored by the hazard detection mechanism. +endif::[] + +ifeval::[{CvxifEn} == true] +** FIXME hazards related to CV-X-IF +endif::[] + +* Structural hazards: ISSUE_STAGE checks that a functional unit (FU) and its result bus (RB) are ready to execute the instruction. + +** Integer division instructions and some^[FIXME{sp}which?]^ CSR instructions have an unknown latency.
+When EX_STAGE reports that such an instruction is running, instructions using ALU, BRANCH, CSR or MULT are blocked. +This is to avoid conflicts on the RB shared by these four FUs. +ifeval::[{SuperscalarEn} == true] +Note that ALU2 might still be available. +endif::[] + +** Multiplications have a fixed latency of 2 cycles. +Instructions using ALU, BRANCH or CSR are blocked if an instruction using MULT was issued one cycle earlier. +This is to avoid conflicts on the RB shared by these four FUs. +ifeval::[{SuperscalarEn} == true] +Note that ALU2 might still be available. +endif::[] +Instructions using MULT are not blocked because the multiplier is pipelined and can accept one instruction each cycle. + +ifeval::[{FpPresent} == true] +** FIXME hazards related to FPU +endif::[] + +** Instructions using LSU are blocked if LSU is not ready. + +ifeval::[{SuperscalarEn} == true] +** The second issue port cannot issue instructions using CSR. + +ifeval::[{CvxifEn} == true] +** The second issue port cannot issue instructions using CV-X-IF. +endif::[] + +** The second issue port cannot issue an instruction using a FU when the first port issues an instruction using the same FU. +This is because a FU can only accept one instruction each cycle. + +** When both ALU and ALU2 are available, ALU instructions use ALU2. + +** The second issue port cannot issue an instruction using ALU, BRANCH or CSR when the first issue port is issuing an instruction using ALU, BRANCH or CSR. +This is to avoid conflicts on the RB shared by these four FUs. +Note that ALU2 might still be available. + +* Control hazards: ISSUE_STAGE checks that no instruction is executed speculatively while it must not. + +** The second issue port is completely blocked when the first port issues an instruction using CSR. 
+ +** The second issue port +ifeval::[{SpeculativeSb} == true] +cannot issue an instruction using LSU +endif::[] +ifeval::[{SpeculativeSb} == false] +is completely blocked +endif::[] +when the first issue port issues an instruction using BRANCH, unless it is a JAL. + +// {SuperscalarEn} == true +endif::[] + +Data hazards are ignored when an exception occurred earlier in the pipeline. +As no FU is involved, there are no structural hazards either. + +Instructions are issued in-order, which means that when an instruction makes ISSUE_STAGE stall, next instructions are blocked. +ifeval::[{SuperscalarEn} == true] +As a consequence, it is not possible to issue the instruction in the second issue port when the instruction in the first issue port is blocked. +endif::[] + +ifeval::[{SuperscalarEn} == true] +However, it is possible that the instruction in the first issue port is issued while the instruction in the second issue port is not. +In such a case, ID_STAGE moves the instruction in the second issue port to the first issue port so that it is issued next. +endif::[] + +The input operands provided to EX_STAGE come from the register file by default. +However, when one of the source registers has a RAW dependence, the corresponding input operand is replaced by the forwarded value (see <<raw_hazard>> above). +The register file is an instance of `ariane_regfile` where each register stores {XLEN} bits and the register at index 0 is wired to zero. + +ifeval::[{CvxifEn} == true] +FIXME Document behavior related to CV-X-IF +endif::[] + +Instructions are sent to EX_STAGE via a register so they are visible in EX_STAGE one cycle after being issued.
+ +include::port_issue_read_operands.adoc[] diff --git a/docs/design/design-manual/source/design.adoc b/docs/design/design-manual/source/design.adoc new file mode 100644 index 0000000000..169f990437 --- /dev/null +++ b/docs/design/design-manual/source/design.adoc @@ -0,0 +1,71 @@ +//// + Copyright (c) 2022 Thales + Licensed under the Solderpad Hardware Licence, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + SPDX-License-Identifier: Apache-2.0 WITH SHL-2.0 + You may obtain a copy of the License at https://solderpad.org/licenses/ + + Original Author: Jean-Roch COULON - Thales +//// + +include::config.adoc[] +include::config_define.adoc[] + +ifeval::[{SuperscalarEn} == true] +:ifetch-len: 64 +:instr-per-fetch: 4 +:issue-width: 2 +:commit-width: 2 +endif::[] +ifeval::[{SuperscalarEn} == false] +:ifetch-len: 32 +:instr-per-fetch: 2 +:issue-width: 1 +:commit-width: {NrCommmitPorts} +endif::[] +:SpeculativeSb: {SuperscalarEn} + +:FpPresent: false +ifeval::[{RVF} == true] +:FpPresent: true +endif::[] +ifeval::[{RVD} == true] +:FpPresent: true +endif::[] +ifeval::[{XF16} == true] +:FpPresent: true +endif::[] +ifeval::[{XF16ALT} == true] +:FpPresent: true +endif::[] +ifeval::[{XF8} == true] +:FpPresent: true +endif::[] + +[[DesignDocument]] += Design Documentation for {ohg-config} architecture +:description: Design documentation for {ohg-config} +:company: THALES +:doctype: book +:sectnums: +:sectnumlevels: 5 +:toc: left +:toclevels: 4 +:table-caption: Table +:figure-caption: Figure +:xrefstyle: short +:imagesdir: images +:example-caption: Example +:listing-caption: Listing +:chapter-refsig: Chapter +:section-refsig: Section +:appendix-refsig: Appendix +:data-uri: + +Editor: *Jean Roch Coulon* + +include::intro.adoc[] +include::subsystem.adoc[] +include::functionality.adoc[] +include::architecture.adoc[] +include::glossary.adoc[] diff --git a/docs/04_cv32a65x/design/source/functionality.rst 
b/docs/design/design-manual/source/functionality.adoc similarity index 70% rename from docs/04_cv32a65x/design/source/functionality.rst rename to docs/design/design-manual/source/functionality.adoc index 71198e13ee..b4bd673f42 100644 --- a/docs/04_cv32a65x/design/source/functionality.rst +++ b/docs/design/design-manual/source/functionality.adoc @@ -1,4 +1,4 @@ -.. +//// Copyright 2023 Thales DIS France SAS Licensed under the Solderpad Hardware Licence, Version 2.0 (the "License"); you may not use this file except in compliance with the License. @@ -6,17 +6,14 @@ You may obtain a copy of the License at https://solderpad.org/licenses/ Original Author: Jean-Roch COULON - Thales +//// - - +[[functionality]] Functionality -============= - -.. toctree:: - :maxdepth: 1 +------------- - instructions - traps - CSRs - AXI - CVXIF +include::instructions.adoc[] +include::traps.adoc[] +include::CSRs.adoc[] +include::AXI.adoc[] +include::CVXIF.adoc[] diff --git a/docs/design/design-manual/source/glossary.adoc b/docs/design/design-manual/source/glossary.adoc new file mode 100644 index 0000000000..a484f9d3f8 --- /dev/null +++ b/docs/design/design-manual/source/glossary.adoc @@ -0,0 +1,64 @@ +[[glossary]] +Glossary +-------- + +* *ALU*: Arithmetic/Logic Unit +* *APU*: Application Processing Unit +* *ASIC*: Application-Specific Integrated Circuit +* *AXI*: Advanced eXtensible Interface +* *BHT*: Branch History Table +* *BTB*: Branch Target Buffer +* *Byte*: 8-bit data item +* *CPU*: Central Processing Unit, processor +* *CSR*: Control and Status Register +* *Custom extension*: Non-Standard extension to the RISC-V base +instruction set (RISC-V Instruction Set Manual, Volume I: User-Level +ISA) +* *CVA6*: Core-V Application class processor with a 6 stage pipeline +* *D$*: Data Cache +* *DPI*: Direct Programming Interface +* *EX* or *EXE*: Instruction Execute +* *FPGA*: Field Programmable Gate Array +* *FPU*: Floating Point Unit +* *Halfword*: 16-bit data item +* *Halfword aligned 
address*: An address is halfword aligned if it is +divisible by 2 +* *I$*: Instruction Cache +* *ID*: Instruction Decode +* *IF*: Instruction Fetch +* *ISA*: Instruction Set Architecture +* *KGE*: Kilo Gate Equivalents (NAND2) +* *LSU*: Load Store Unit +* *M-Mode*: Machine Mode (RISC-V Instruction Set Manual, Volume II: +Privileged Architecture) +* *MMU*: Memory Management Unit +* *NC*: Not Cacheable +* *OBI*: Open Bus Interface +* *OoO*: Out Of Order +* *PC*: Program Counter +* *PMP*: Physical memory protection (RISC-V Instruction Set Manual, +Volume II: Privileged Architecture) +* *PTW*: Page Table Walker +* *PULP platform*: Parallel Ultra Low Power Platform +(https://pulp-platform.org) +* *RAS*: Return Address Stack +* *RV32C*: RISC-V Compressed (C extension) +* *RV32F*: RISC-V Floating Point (F extension) +* *S-Mode*: Supervisor Mode (RISC-V Instruction Set Manual, Volume II: +Privileged Architecture) +* *SIMD*: Single Instruction/Multiple Data +* *Standard extension*: Standard extension to the RISC-V base +instruction set (RISC-V Instruction Set Manual, Volume I: User-Level +ISA) +* *TLB*: Translation Lookaside Buffer +* *U-Mode*: User Mode (RISC-V Instruction Set Manual, Volume II: +Privileged Architecture) +* *VLEN*: Virtual address length +* *WARL*: Write Any Values, Reads Legal Values +* *WB*: Write Back of instruction results +* *WLRL*: Write/Read Only Legal Values +* *Word*: 32-bit data item +* *Word aligned address*: An address is word aligned if it is divisible +by 4 +* *WPRI*: Reserved Writes Preserve Values, Reads Ignore Values +* *XLEN*: RISC-V processor data length diff --git a/docs/04_cv32a65x/design/images/CVA6_subsystems.png b/docs/design/design-manual/source/images/CVA6_subsystems.png similarity index 100% rename from docs/04_cv32a65x/design/images/CVA6_subsystems.png rename to docs/design/design-manual/source/images/CVA6_subsystems.png diff --git a/docs/04_cv32a65x/design/images/LZC.png b/docs/design/design-manual/source/images/LZC.png similarity index 100% rename
from docs/04_cv32a65x/design/images/LZC.png rename to docs/design/design-manual/source/images/LZC.png diff --git a/docs/04_cv32a65x/design/images/RR.png b/docs/design/design-manual/source/images/RR.png similarity index 100% rename from docs/04_cv32a65x/design/images/RR.png rename to docs/design/design-manual/source/images/RR.png diff --git a/docs/04_cv32a65x/design/images/ZoominFrontend.png b/docs/design/design-manual/source/images/ZoominFrontend.png similarity index 100% rename from docs/04_cv32a65x/design/images/ZoominFrontend.png rename to docs/design/design-manual/source/images/ZoominFrontend.png diff --git a/docs/04_cv32a65x/design/images/ariane_overview.drawio.png b/docs/design/design-manual/source/images/ariane_overview.drawio.png similarity index 100% rename from docs/04_cv32a65x/design/images/ariane_overview.drawio.png rename to docs/design/design-manual/source/images/ariane_overview.drawio.png diff --git a/docs/04_cv32a65x/design/images/ariane_overview.png b/docs/design/design-manual/source/images/ariane_overview.png similarity index 100% rename from docs/04_cv32a65x/design/images/ariane_overview.png rename to docs/design/design-manual/source/images/ariane_overview.png diff --git a/docs/04_cv32a65x/design/images/bht.png b/docs/design/design-manual/source/images/bht.png similarity index 100% rename from docs/04_cv32a65x/design/images/bht.png rename to docs/design/design-manual/source/images/bht.png diff --git a/docs/04_cv32a65x/design/images/caches.png b/docs/design/design-manual/source/images/caches.png similarity index 100% rename from docs/04_cv32a65x/design/images/caches.png rename to docs/design/design-manual/source/images/caches.png diff --git a/docs/04_cv32a65x/design/images/cva6_tlb_entry.png b/docs/design/design-manual/source/images/cva6_tlb_entry.png similarity index 100% rename from docs/04_cv32a65x/design/images/cva6_tlb_entry.png rename to docs/design/design-manual/source/images/cva6_tlb_entry.png diff --git 
a/docs/04_cv32a65x/design/images/cva6_tlb_hit.png b/docs/design/design-manual/source/images/cva6_tlb_hit.png similarity index 100% rename from docs/04_cv32a65x/design/images/cva6_tlb_hit.png rename to docs/design/design-manual/source/images/cva6_tlb_hit.png diff --git a/docs/04_cv32a65x/design/images/ex_stage_modules.png b/docs/design/design-manual/source/images/ex_stage_modules.png similarity index 100% rename from docs/04_cv32a65x/design/images/ex_stage_modules.png rename to docs/design/design-manual/source/images/ex_stage_modules.png diff --git a/docs/04_cv32a65x/design/images/frontend_modules.png b/docs/design/design-manual/source/images/frontend_modules.png similarity index 100% rename from docs/04_cv32a65x/design/images/frontend_modules.png rename to docs/design/design-manual/source/images/frontend_modules.png diff --git a/docs/04_cv32a65x/design/images/id_stage_modules.png b/docs/design/design-manual/source/images/id_stage_modules.png similarity index 100% rename from docs/04_cv32a65x/design/images/id_stage_modules.png rename to docs/design/design-manual/source/images/id_stage_modules.png diff --git a/docs/04_cv32a65x/design/images/in_out_tlb.png b/docs/design/design-manual/source/images/in_out_tlb.png similarity index 100% rename from docs/04_cv32a65x/design/images/in_out_tlb.png rename to docs/design/design-manual/source/images/in_out_tlb.png diff --git a/docs/04_cv32a65x/design/images/issue_stage_modules.png b/docs/design/design-manual/source/images/issue_stage_modules.png similarity index 100% rename from docs/04_cv32a65x/design/images/issue_stage_modules.png rename to docs/design/design-manual/source/images/issue_stage_modules.png diff --git a/docs/04_cv32a65x/design/images/load_store_unit_modules.png b/docs/design/design-manual/source/images/load_store_unit_modules.png similarity index 100% rename from docs/04_cv32a65x/design/images/load_store_unit_modules.png rename to docs/design/design-manual/source/images/load_store_unit_modules.png diff --git 
a/docs/04_cv32a65x/design/images/mmu_control_flow.png b/docs/design/design-manual/source/images/mmu_control_flow.png similarity index 100% rename from docs/04_cv32a65x/design/images/mmu_control_flow.png rename to docs/design/design-manual/source/images/mmu_control_flow.png diff --git a/docs/04_cv32a65x/design/images/mmu_in_out.png b/docs/design/design-manual/source/images/mmu_in_out.png similarity index 100% rename from docs/04_cv32a65x/design/images/mmu_in_out.png rename to docs/design/design-manual/source/images/mmu_in_out.png diff --git a/docs/04_cv32a65x/design/images/mmu_major_blocks.png b/docs/design/design-manual/source/images/mmu_major_blocks.png similarity index 100% rename from docs/04_cv32a65x/design/images/mmu_major_blocks.png rename to docs/design/design-manual/source/images/mmu_major_blocks.png diff --git a/docs/04_cv32a65x/design/images/mult_modules.png b/docs/design/design-manual/source/images/mult_modules.png similarity index 100% rename from docs/04_cv32a65x/design/images/mult_modules.png rename to docs/design/design-manual/source/images/mult_modules.png diff --git a/docs/04_cv32a65x/design/images/openhw-landscape.svg b/docs/design/design-manual/source/images/openhw-landscape.svg similarity index 100% rename from docs/04_cv32a65x/design/images/openhw-landscape.svg rename to docs/design/design-manual/source/images/openhw-landscape.svg diff --git a/docs/04_cv32a65x/design/images/plru_tree_indexing.png b/docs/design/design-manual/source/images/plru_tree_indexing.png similarity index 100% rename from docs/04_cv32a65x/design/images/plru_tree_indexing.png rename to docs/design/design-manual/source/images/plru_tree_indexing.png diff --git a/docs/04_cv32a65x/design/images/ptw_dptw.png b/docs/design/design-manual/source/images/ptw_dptw.png similarity index 100% rename from docs/04_cv32a65x/design/images/ptw_dptw.png rename to docs/design/design-manual/source/images/ptw_dptw.png diff --git a/docs/04_cv32a65x/design/images/ptw_dptw_s.png 
b/docs/design/design-manual/source/images/ptw_dptw_s.png similarity index 100% rename from docs/04_cv32a65x/design/images/ptw_dptw_s.png rename to docs/design/design-manual/source/images/ptw_dptw_s.png diff --git a/docs/04_cv32a65x/design/images/ptw_idle.png b/docs/design/design-manual/source/images/ptw_idle.png similarity index 100% rename from docs/04_cv32a65x/design/images/ptw_idle.png rename to docs/design/design-manual/source/images/ptw_idle.png diff --git a/docs/04_cv32a65x/design/images/ptw_in_out.png b/docs/design/design-manual/source/images/ptw_in_out.png similarity index 100% rename from docs/04_cv32a65x/design/images/ptw_in_out.png rename to docs/design/design-manual/source/images/ptw_in_out.png diff --git a/docs/04_cv32a65x/design/images/ptw_iptw.png b/docs/design/design-manual/source/images/ptw_iptw.png similarity index 100% rename from docs/04_cv32a65x/design/images/ptw_iptw.png rename to docs/design/design-manual/source/images/ptw_iptw.png diff --git a/docs/04_cv32a65x/design/images/ptw_mis_sup.png b/docs/design/design-manual/source/images/ptw_mis_sup.png similarity index 100% rename from docs/04_cv32a65x/design/images/ptw_mis_sup.png rename to docs/design/design-manual/source/images/ptw_mis_sup.png diff --git a/docs/04_cv32a65x/design/images/ptw_nlvl.png b/docs/design/design-manual/source/images/ptw_nlvl.png similarity index 100% rename from docs/04_cv32a65x/design/images/ptw_nlvl.png rename to docs/design/design-manual/source/images/ptw_nlvl.png diff --git a/docs/04_cv32a65x/design/images/ptw_pte_1.png b/docs/design/design-manual/source/images/ptw_pte_1.png similarity index 100% rename from docs/04_cv32a65x/design/images/ptw_pte_1.png rename to docs/design/design-manual/source/images/ptw_pte_1.png diff --git a/docs/04_cv32a65x/design/images/ptw_pte_flowchart.png b/docs/design/design-manual/source/images/ptw_pte_flowchart.png similarity index 100% rename from docs/04_cv32a65x/design/images/ptw_pte_flowchart.png rename to 
docs/design/design-manual/source/images/ptw_pte_flowchart.png diff --git a/docs/04_cv32a65x/design/images/ptw_state_diagram.png b/docs/design/design-manual/source/images/ptw_state_diagram.png similarity index 100% rename from docs/04_cv32a65x/design/images/ptw_state_diagram.png rename to docs/design/design-manual/source/images/ptw_state_diagram.png diff --git a/docs/04_cv32a65x/design/images/replacement_entry.png b/docs/design/design-manual/source/images/replacement_entry.png similarity index 100% rename from docs/04_cv32a65x/design/images/replacement_entry.png rename to docs/design/design-manual/source/images/replacement_entry.png diff --git a/docs/design/design-manual/source/images/schema_fsm_load_control.png b/docs/design/design-manual/source/images/schema_fsm_load_control.png new file mode 100644 index 0000000000..a5907072ea Binary files /dev/null and b/docs/design/design-manual/source/images/schema_fsm_load_control.png differ diff --git a/docs/04_cv32a65x/design/images/sfence_vaddr_asid.png b/docs/design/design-manual/source/images/sfence_vaddr_asid.png similarity index 100% rename from docs/04_cv32a65x/design/images/sfence_vaddr_asid.png rename to docs/design/design-manual/source/images/sfence_vaddr_asid.png diff --git a/docs/04_cv32a65x/design/images/sfence_vaddr_x0.png b/docs/design/design-manual/source/images/sfence_vaddr_x0.png similarity index 100% rename from docs/04_cv32a65x/design/images/sfence_vaddr_x0.png rename to docs/design/design-manual/source/images/sfence_vaddr_x0.png diff --git a/docs/04_cv32a65x/design/images/sfence_x0_asid.png b/docs/design/design-manual/source/images/sfence_x0_asid.png similarity index 100% rename from docs/04_cv32a65x/design/images/sfence_x0_asid.png rename to docs/design/design-manual/source/images/sfence_x0_asid.png diff --git a/docs/04_cv32a65x/design/images/sfence_x0_x0.png b/docs/design/design-manual/source/images/sfence_x0_x0.png similarity index 100% rename from docs/04_cv32a65x/design/images/sfence_x0_x0.png 
rename to docs/design/design-manual/source/images/sfence_x0_x0.png diff --git a/docs/04_cv32a65x/design/images/shared_tlb.png b/docs/design/design-manual/source/images/shared_tlb.png similarity index 100% rename from docs/04_cv32a65x/design/images/shared_tlb.png rename to docs/design/design-manual/source/images/shared_tlb.png diff --git a/docs/04_cv32a65x/design/images/shared_tlb_in_out.png b/docs/design/design-manual/source/images/shared_tlb_in_out.png similarity index 100% rename from docs/04_cv32a65x/design/images/shared_tlb_in_out.png rename to docs/design/design-manual/source/images/shared_tlb_in_out.png diff --git a/docs/04_cv32a65x/design/images/shared_tlb_set.png b/docs/design/design-manual/source/images/shared_tlb_set.png similarity index 100% rename from docs/04_cv32a65x/design/images/shared_tlb_set.png rename to docs/design/design-manual/source/images/shared_tlb_set.png diff --git a/docs/04_cv32a65x/design/images/subsystems.png b/docs/design/design-manual/source/images/subsystems.png similarity index 100% rename from docs/04_cv32a65x/design/images/subsystems.png rename to docs/design/design-manual/source/images/subsystems.png diff --git a/docs/04_cv32a65x/design/images/update_tree.png b/docs/design/design-manual/source/images/update_tree.png similarity index 100% rename from docs/04_cv32a65x/design/images/update_tree.png rename to docs/design/design-manual/source/images/update_tree.png diff --git a/docs/design/design-manual/source/instructions.adoc b/docs/design/design-manual/source/instructions.adoc new file mode 100644 index 0000000000..d7e0e5ba96 --- /dev/null +++ b/docs/design/design-manual/source/instructions.adoc @@ -0,0 +1,19 @@ +//// + Copyright 2023 Thales DIS France SAS + Licensed under the Solderpad Hardware Licence, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. 
+ SPDX-License-Identifier: Apache-2.0 WITH SHL-2.0 + You may obtain a copy of the License at https://solderpad.org/licenses/ + + Original Author: Jean-Roch COULON - Thales +//// + +[[instructions]] +Instructions +~~~~~~~~~~~~ + +The next subchapter lists the extensions implemented in {ohg-config}. By +configuration, we can enable/disable the extensions. {ohg-config} supports +the extensions described in the next subchapters. + +include::isa/isa.adoc[] diff --git a/docs/design/design-manual/source/intro.adoc b/docs/design/design-manual/source/intro.adoc new file mode 100644 index 0000000000..ba3237458c --- /dev/null +++ b/docs/design/design-manual/source/intro.adoc @@ -0,0 +1,120 @@ +//// + Copyright 2022 Thales DIS design services SAS + Licensed under the Solderpad Hardware Licence, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + SPDX-License-Identifier: Apache-2.0 WITH SHL-2.0 + You may obtain a copy of the License at https://solderpad.org/licenses/ + + Original Author: Jean-Roch COULON - Thales +//// + +[[introduction]] +Introduction +------------ + +The OpenHW Group uses https://semver.org/[semantic versioning] to +describe the release status of its IP. This document describes the +{ohg-config} configuration version of CVA6. This intends to be the first +formal release of CVA6. + +CVA6 is a 6-stage in-order and single issue processor core which +implements the RISC-V instruction set. CVA6 can be configured as a 32- +or 64-bit core (RV32 or RV64), called CV32A6 or CV64A6. + +The objective of this document is to provide enough information to allow +the RTL modification (by designers) and the RTL verification (by +verificators). This document is not dedicated to CVA6 users looking for +information to develop software like instructions or registers. + +The CVA6 architecture is illustrated in the following figure. 
+ +image:ariane_overview.drawio.png[CVA6 Architecture] + +[[license]] +License +~~~~~~~ + +[verse] +-- +Copyright 2022 Thales +Copyright 2018 ETH Zürich and University of Bologna +SPDX-License-Identifier: Apache-2.0 WITH SHL-2.1 +Licensed under the Solderpad Hardware License v 2.1 (the “License”); you may not use this file except in compliance with the License, or, at your option, the Apache License version 2.0. You may obtain a copy of the License at https://solderpad.org/licenses/SHL-2.1/. +Unless required by applicable law or agreed to in writing, any work distributed under the License is distributed on an “AS IS” BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. +-- + +[[standards-compliance]] +Standards Compliance +~~~~~~~~~~~~~~~~~~~~ + +To ease the reading, the reference to these specifications can be +implicit in the requirements below. For the sake of precision, the +requirements identify the versions of RISC-V extensions from these +specifications. + +* *[CVA6req]* “CVA6 requirement specification”, +https://github.com/openhwgroup/cva6/blob/master/docs/specifications/cva6_requirement_specification.rst, +HASH#767c465. +* *[RVunpriv]* “The RISC-V Instruction Set Manual, Volume I: User-Level +ISA, Document Version 20191213”, Editors Andrew Waterman and Krste +Asanović, RISC-V Foundation, December 13, 2019. +* *[RVpriv]* “The RISC-V Instruction Set Manual, Volume II: Privileged +Architecture, Document Version 20211203”, Editors Andrew Waterman, Krste +Asanović and John Hauser, RISC-V Foundation, December 4, 2021. +* *[RVdbg]* “RISC-V External Debug Support, Document Version 0.13.2”, +Editors Tim Newsome and Megan Wachs, RISC-V Foundation, March 22, 2019. +* *[RVcompat]* “RISC-V Architectural Compatibility Test Framework”, +https://github.com/riscv-non-isa/riscv-arch-test. 
+* *[AXI]* AXI Specification, +https://developer.arm.com/documentation/ihi0022/hc. +* *[CV-X-IF]* Placeholder for the CV-X-IF coprocessor interface +currently prepared at OpenHW Group; current version in +https://docs.openhwgroup.org/projects/openhw-group-core-v-xif/. +* *[OpenPiton]* “OpenPiton Microarchitecture Specification”, Princeton +University, +https://parallel.princeton.edu/openpiton/docs/micro_arch.pdf. + +{ohg-config} is a standards-compliant {XLEN}-bit processor fully compliant with +RISC-V specifications: [RVunpriv], [RVpriv] and [RVdbg] and passes +[RVcompat] compatibility tests, as requested by [GEN-10] in [CVA6req]. + +[[documentation-framework]] +Documentation framework +~~~~~~~~~~~~~~~~~~~~~~~ + +The framework of this document is inspired by the Common Criteria. The +Common Criteria for Information Technology Security Evaluation (referred +to as Common Criteria or CC) is an international standard (ISO/IEC +15408) for computer security certification. + +Description of the framework: + +* Processor is split into module corresponding to the main modules of +the design +* Modules can contain several modules +* Each module is described in a chapter, which contains the following +subchapters: _Description_, _Functionalities_, _Architecture and +Modules_ and _Registers_ (if any) +* The subchapter _Description_ describes the main features of the +submodule, the interconnections between the current module and the +others and the inputs/outputs interface. +* The subchapter _Functionality_ lists in details the module +functionalities. Please avoid using the RTL signal names to explain the +functionalities. 
+* The subchapter _Architecture and Modules_ provides a drawing to +present the module hierarchy, then the functionalities covered by the +module +* The subchapter _Registers_ specifies the module registers if any + +[[contributors]] +Contributors +~~~~~~~~~~~~ + +[verse] +-- +Jean-Roch Coulon - Thales +Ayoub Jalali (mailto:ayoub.jalali@external.thalesgroup.com[ayoub.jalali@external.thalesgroup.com]) +Alae Eddine Ezzejjari (mailto:alae-eddine.ez-zejjari@external.thalesgroup.com[alae-eddine.ez-zejjari@external.thalesgroup.com]) +-- + +*[TO BE COMPLETED]* diff --git a/docs/design/design-manual/source/mmu.adoc b/docs/design/design-manual/source/mmu.adoc new file mode 100644 index 0000000000..d41f6859d2 --- /dev/null +++ b/docs/design/design-manual/source/mmu.adoc @@ -0,0 +1,1258 @@ +[[CVA6_MMU]] +[[memory-management-unit]] +Memory Management Unit +---------------------- + +The Memory Management Unit (MMU) SV32 module is a crucial component in +the RISC-V-based processor, serving as the backbone for virtual memory +management and address translation. + +image:mmu_in_out.png[*Figure 1:* Inputs and Outputs of CVA6 +MMU SV32,scaledwidth=70.0%] + +At its core, the MMU SV32 plays a pivotal role in translating virtual +addresses into their corresponding physical counterparts. This +translation process is paramount for providing memory protection, +isolation, and efficient memory management in modern computer systems. +Importantly, it handles both instruction and data accesses, ensuring a +seamless interaction between the processor and virtual memory. Within +the MMU, several major blocks play pivotal roles in this address +translation process. 
These include:
+ +[cols=",,,,",options="header",] +|======================================================================= +|Signal |IO |Connection Type |Type |Description +|`clk_i` |in |Subsystem |logic |Subsystem Clock + +|`rst_ni` |in |Subsystem |logic |Asynchronous reset active low + +|`flush_i` |in |Controller |logic |Sfence Committed + +|`enable_translation_i` |in |CSR RegFile |logic |Indicate address +translation request for instruction + +|`en_ld_st_translation_i` |in |CSR RegFile |logic |Indicate address +translation request for load or store + +|`icache_areq_i` |in |Cache Subsystem |icache_arsp_t |Icache Response + +|`icache_areq_o` |out |Cache Subsystem |icache_areq_t |Icache Request + +|`misaligned_ex_i` |in |Load Store Unit |exception_t |Indicate +misaligned exception + +|`lsu_req_i` |in |Load Store Unit |logic |Request address translation + +|`lsu_vaddr_i` |in |Load Store Unit |logic [riscv::VLEN-1:0] |Virtual +Address In + +|`lsu_is_store_i` |in |Store Unit |logic |Translation is requested by a +store + +|`lsu_dtlb_hit_o` |out |Store / Load Unit |logic |Indicate a DTLB hit + +|`lsu_dtlb_ppn_o` |out |Load Unit |logic [riscv::PPNW-1:0] |Send PNN to +LSU + +|`lsu_valid_o` |out |Load Store Unit |logic |Indicate a valid +translation + +|`lsu_paddr_o` |out |Store / Load Unit |logic [riscv::PLEN-1:0] +|Translated Address + +|`lsu_exception_o` |out |Store / Load Unit |exception_t |Address +Translation threw an exception + +|`priv_lvl_i` |in |CSR RegFile |riscv::priv_lvl_t |Privilege level for +instruction fetch interface + +|`ld_st_priv_lvl_i` |in |CSR RegFile |riscv::priv_lvl_t |Privilege Level +for Data Interface + +|`sum_i` |in |CSR RegFile |logic |Supervisor User Memory Access bit in +xSTATUS CSR register + +|`mxr_i` |in |CSR RegFile |logic |Make Executable Readable bit in +xSTATUS CSR register + +|`satp_ppn_I` |in |CSR RegFile |logic [riscv::PPNW-1:0] |PPN of top +level page table from SATP register + +|`asid_i` |in |CSR RegFile |logic [ASID_WIDTH-1:0] |ASID to for 
the +lookup + +|`asid_to_be_flushed` |in |Execute Stage |logic [ASID_WIDTH-1:0] |ASID +of the entry to be flushed. + +|`vaddr_to_be_flushed_i` |in |Execute Stage |logic [riscv::VLEN-1:0] +|Virtual address of the entry to be flushed. + +|`flush_tlb_i` |in |Controller |logic |SFENCE.VMA committed + +|`itlb_miss_o` |out |Performance Counter |logic |Indicate an ITLB miss + +|`dtlb_miss_o` |out |Performance Counter |logic |Indicate a DTLB miss + +|`req_port_i` |in |Cache Subsystem |dcache_req_o_t |D Cache Data +Requests + +|`req_port_o` |out |Cache Subsystem |dcache_req_i_t |D Cache Data +Response + +|`pmpcfg_i` |in |CSR RegFile |riscv::pmpcfg_t [15:0] |PMP configurations + +|`pmpaddr_i` |in |CSR RegFile |logic [15:0][riscv::PLEN-3:0] |PMP +Address +|======================================================================= + +[cols=",,",options="header",] +|=============================================================== +|Signal |Type |Description +|`fetch_valid` |logic |Address Translation Valid +|`fetch_paddr` |logic [riscv::PLEN-1:0] |Physical Address In +|`fetch_exception` |exception_t |Exception occurred during fetch +|=============================================================== + +[cols=",,",options="header",] +|=========================================================== +|Signal |Type |Description +|`fetch_req` |logic |Address Translation Request +|`fetch_vaddr` |logic [riscv::VLEN-1:0] |Virtual Address out +|=========================================================== + +[cols=",,",options="header",] +|======================================================================= +|Signal |Type |Description +|`cause` |riscv::xlen_t |Cause of exception + +|`tval` |riscv::xlen_t |Additional information of causing exception +(e.g. 
instruction causing it), address of LD/ST fault + +|`valid` |logic |Indicate that exception is valid +|======================================================================= + +[cols=",,",options="header",] +|==================================================================== +|Signal |Type |Description +|`locked` |logic |Lock this configuration +|`reserved` |logic[1:0] |Reserved bits in pmpcfg CSR +|`addr_mode` |pmp_addr_mode_t |Addressing Modes: OFF, TOR, NA4, NAPOT +|`access_type` |pmpcfg_access_t |None, read, write, execute +|==================================================================== + +image:mmu_control_flow.png[*Figure 3:* Control Flow in CVA6 +MMU SV32,scaledwidth=95.0%] + +Two potential exception sources exist: + +* Hardware Page Table Walker (HPTW) throwing an exception, signifying a +page fault exception. +* Access error due to insufficient permissions of PMP, known as an +access exception. + +The IF stage initiates a request to retrieve memory content at a +specific virtual address. When the MMU is disabled, the instruction +fetch request is directly passed to the I$ without modifications. + +If virtual memory translation is enabled for instruction fetches, the +following operations are performed in the instruction interface: + +* Compatibility of requested virtual address with selected page based +address translation scheme is checked. +* For 4K page translation, the module determines the fetch physical +address by combining the physical page number (PPN) from ITLB content +and the offset from the virtual address. +* In the case of Mega page translation, if the ITLB indicates a 4M page, +the VPN0 from the fetch virtual address is written to the PPN0 of the +fetch physical address to ensure alignment for superpage translation. +* If the Instruction TLB (ITLB) lookup hits, the fetch valid signal +(which indicates a valid physical address) is activated in response to +the input fetch request. 
Memory region accessibility is checked from the +perspective of the fetch operation, potentially triggering a page fault +exception in case of an access error or insufficient PMP permission. +* In case of an ITLB miss, if the page table walker (PTW) is active +(only active if there is a shared TLB miss) and handling instruction +fetches, the fetch valid signal is determined based on PTW errors or +access exceptions. + +If the fetch physical address doesn't match any execute region, an +Instruction Access Fault is raised. When not translating, PMPs are +immediately checked against the physical address for access +verification. + +If address translation is enabled for load or store, and no misaligned +exception has occurred, the following operations are performed in the +data interface: + +* Initially, translation is assumed to be invalid, signified by the MMU +to LSU. +* The translated physical address is formed by combining the PPN from +the Page Table Entry (PTE) and the offset from the virtual address +requiring translation. This send one cycle later due to the additional +bank of registers which delayed the MMU’s answer. The PPN from the PTE +is also shared separately with LSU in the same cycle as the hit. +* In the case of superpage translation, as in SV32, known as the 4M +page, PPN0 of the translated physical address and the separately shared +PPN are updated with the VPN0 of the virtual address. + +If a Data TLB (DTLB) hit occurs, it indicates a valid translation, and +various fault checks are performed depending on whether it's a load or +store request. + +* For store requests, if the page is not writable, the dirty flag isn't +set, or privileges are violated, it results in a page fault +corresponding to the store access. If PMPs are also violated, it leads +to an access fault corresponding to the store access. Page faults take +precedence over access faults. +* For load requests, a page fault is triggered if there are insufficient +access privileges. 
PMPs are checked again during load access, resulting +in an access fault corresponding to load access if PMPs are violated. + +In case of a DTLB miss, potential exceptions are monitored during the +page table walk. If the PTW indicates a page fault, the corresponding +page fault related to the requested type is signaled. If the PTW +indicates an access exception, the load access fault is indicated +through address translation because the page table walker can only throw +load access faults. + +When address translation is not enabled, the physical address is +immediately checked against Physical Memory Protections (PMPs). If there +is a request from LSU, no misaligned exception, and PMPs are violated, +it results in an access fault corresponding to the request being +indicated. + +[[translation-lookaside-buffer]] +Translation Lookaside Buffer +---------------------------- + +Page tables are accessed for translating virtual memory addresses to +physical memory addresses. This translation needs to be carried out for +every load and store instruction and also for every instruction fetch. +Since page tables are resident in physical memory, accessing these +tables in all these situations has a significant impact on performance. +Page table accesses occur in patterns that are closely related in time. +Furthermore, the spatial and temporal locality of data accesses or +instruction fetches mean that the same page is referenced repeatedly. +Taking advantage of these access patterns the processor keeps the +information of recent address translations, to enable fast retrieval, in +a small cache called the Translation Lookaside Buffer (TLB) or an +address-translation cache. + +The CVA6 TLB is structured as a fully associative cache, where the +virtual address that needs to be translated is compared against all the +individual TLB entries. 
Given a virtual address, the processor examines +the TLB (TLB lookup) to determine if the virtual page number (VPN) of +the page being accessed is in the TLB. When a TLB entry is found (TLB +hit), the TLB returns the corresponding physical page number (PPN) which +is used to calculate the target physical address. If no TLB entry is +found (TLB miss) the processor has to read individual page table entries +from memory (Table walk). In CVA6 table walking is supported by +dedicated hardware. Once the processor finishes the table walk it has +the Physical Page Number (PPN) corresponding to the Virtual Page Number +(VPN) That needs to be translated. The processor adds an entry for this +address translation to the TLB so future translations of that virtual +address will happen quickly through the TLB. During the table walk the +processor may find out that the corresponding physical page is not +resident in memory. At this stage a page table exception (Page Fault) is +generated which gets handled by the operating system. The operating +system places the appropriate page in memory, updates the appropriate +page tables and returns execution to the instruction which generated the +exception. + +The inputs and output signals of the TLB are shown in the following two +figures. 
+ +image:in_out_tlb.png[*Figure 4:* Inputs and Outputs of CVA6 +TLB,scaledwidth=65.0%] + +[cols=",,,,",options="header",] +|======================================================================= +|Signal |IO |connection |Type |Description +|`clk_i` |in |SUBSYSTEM |logic |Subsystem Clock + +|`rst_ni` |in |SUBSYSTEM |logic |Asynchronous reset active low + +|`flush_i` |in |Controller |logic |Asynchronous reset active low + +|`update_i` |in |Shared TLB |tlb_update_sv32_t |Updated tag and content +of TLB + +|`lu_access_i` |in |Cache Subsystem |logic |Signal indicating a lookup +access is being requested + +|`lu_asid_i` |in |CSR RegFile |logic[ASID_WIDTH-1:0] |ASID (Address +Space Identifier) for the lookup + +|`lu_vaddr_i` |in |Cache Subsystem |logic[riscv::VLEN-1:0] |Virtual +address for the lookup + +|`lu_content_o` |out |MMU SV32 |riscv::pte_sv32_t |Output for the +content of the TLB entry + +|`asid_to_be_flushed_i` |in |Execute Stage |logic[ASID_WIDTH-1:0] |ASID +of the entry to be flushed + +|`vaddr_to_be_flushed_i` |in |Execute Stage |logic[riscv::VLEN-1:0] +|Virtual address of the entry to be flushed + +|`lu_is_4M_o` |out |MMU SV32 |logic |Output indicating whether the TLB +entry corresponds to a 4MB page + +|`lu_hit_o` |out |MMU SV32 |logic |Output indicating whether the lookup +resulted in a hit or miss +|======================================================================= + +[cols=",,",options="header",] +|======================================================================= +|Signal |Type |Description +|`valid` |logic |Indicates whether the TLB update entry is valid or not + +|`is_4M` |logic |Indicates if the TLB entry corresponds to a 4MB page + +|`vpn` |logic[19:0] |Virtual Page Number (VPN) used for updating the +TLB, consisting of 20 bits + +|`asid` |logic[8:0] |Address Space Identifier (ASID) used for updating +the TLB, with a length of 9 bits for Sv32 MMU + +|`content` |riscv::pte_sv32_t |Content of the TLB update entry, defined +by the structure 
+|======================================================================= + +[cols=",,",options="header",] +|======================================================================= +|Signal |Type |Description +|`ppn` |logic[21:0] |22 bit Physical Page Number (PPN) + +|`rsw` |logic[1:0] |Reserved for use by supervisor software + +|`d` |logic a| +[verse] +-- +Dirty bit indicating whether the page has been modified (dirty) or not +0: Page is clean i.e., has not been written +1: Page is dirty i.e., has been written +-- + +|`a` |logic a| +[verse] +-- +Accessed bit indicating whether the page has been accessed +0: Virtual page has not been accessed since the last time A bit was cleared +1: Virtual page has been read, written, or fetched from since the last time the A bit was cleared +-- + +|`g` |logic a| +[verse] +-- +Global bit marking a page as part of a global address space valid for all ASIDs +0: Translation is valid for specific ASID +1: Translation is valid for all ASIDs +-- + +|`u` |logic a| +[verse] +-- +User bit indicating privilege level of the page +0: Page is not accessible in user mode but in supervisor mode +1: Page is accessible in user mode but not in supervisor mode +-- + +|`x` |logic a| +[verse] +-- +Execute bit which allows execution of code from the page +0: Code execution is not allowed +1: Code execution is permitted +-- + +|`w` |logic a| +[verse] +-- +Write bit allows the page to be written +0: Write operations are not allowed +1: Write operations are permitted +-- + +|`r` |logic a| +[verse] +-- +Read bit allows read access to the page +0: Read operations are not allowed +1: Read operations are permitted +-- + +|`v` |logic a| +[verse] +-- +Valid bit indicating the page table entry is valid +0: Page is invalid i.e. page is not in DRAM, translation is not valid +1: Page is valid i.e. 
page resides in the DRAM, translation is valid +-- + +|======================================================================= + +The number of TLB entries can be changed via a design parameter. In +32-bit configurations of CVA6 only 2 TLB entries are instantiated. Each +TLB entry is made up of two fields: Tag and Content. The Tag field holds +the virtual page number (VPN1, VPN0), ASID, page size (is_4M) along with +a valid bit (VALID) indicating that the entry is valid. The SV32 virtual +page number, which is supported by CV32A6X, is further split into two +separate virtual page numbers VPN1 and VPN0. The Content field contains +two physical page numbers (PPN1, PPN0) along with a number of bits which +specify various attributes of the physical page. Note that the V bit in +the Content field is the V bit which is present in the page table in +memory. It is copied from the page table, as is, and the VALID bit in +the Tag is set based on its value. The TLB entry fields are shown in +*Figure 5*. + +image:cva6_tlb_entry.png[*Figure 5:* Fields in CVA6 TLB +entry,scaledwidth=80.0%] + +The CVA6 TLB implements the following three functions: + +* *Translation:* This function implements the address lookup and match +logic. +* *Update and Flush:* This function implements the update and flush +logic. +* *Pseudo Least Recently Used Replacement Policy:* This function +implements the replacement policy for TLB entries. + +The translation function takes in the virtual address and certain other fields, +examines the TLB to determine if the virtual page number of the page +being accessed is in the TLB or not. If a TLB entry is found (TLB hit), +the TLB returns the corresponding physical page number (PPN) which is +then used to calculate the target physical address. The following checks +are done as part of this lookup function to find a match in the TLB: + +* *Validity Check:* For a TLB hit, the associated TLB entry must be +valid.
+* *ASID and Global Flag Check:* The TLB entry's ASID must match the +given ASID (ASID associated with the Virtual address). If the TLB +entry’s Global (G) bit is set then this check is not done. This +ensures that the translation is either specific to the provided ASID or +it is globally applicable. +* *Level 1 VPN match:* SV32 implements a two-level page table. As such +the virtual address is broken up into three parts which are the virtual +page number 1, virtual page number 0 and displacement. So the condition +that is checked next is that the virtual page number 1 of the virtual +address matches the virtual page number 1 (VPN1) of the TLB entry. +* *Level 0 VPN match or 4-Mega Page:* The last condition to be checked, +for a TLB hit, is that the virtual page number 0 of the virtual address +matches the virtual page number 0 of the TLB entry (VPN0). This match is +ignored if the is_4M bit in the Tag is set which implies a super 4M +page. + +All the conditions listed above are checked against every TLB entry. If +there is a TLB hit then the corresponding bit in the hit array is set. +*Figure 6* illustrates the TLB hit/miss process listed above. + +image:cva6_tlb_hit.png[*Figure 6:* Block diagram of CVA6 TLB +hit or miss,scaledwidth=75.0%] + +The SFENCE.VMA instruction can be used with certain specific source +register specifiers (rs1 & rs2) to flush a specific TLB entry, some set +of TLB entries or all TLB entries. Like all instructions this action +only takes place when the SFENCE.VMA instruction is committed (shown via +the commit_sfence signal in the following figures.)
The behavior of the +instruction is as follows: + +* *If rs1 is not equal to x0 and rs2 is not equal to x0:* Invalidate all +TLB entries which contain leaf page table entries corresponding to the +virtual address in rs1 (shown below as Virtual Address to be flushed) +and that match the address space identifier as specified by integer +register rs2 (shown below as asid_to_be_flushed_i), except for entries +containing global mappings. This is referred to as the “SFENCE.VMA vaddr +asid” case. + +image:sfence_vaddr_asid.png[*Figure 7:* Invalidate TLB entry +if ASID and virtual address match,scaledwidth=75.0%] + +* *If rs1 is equal to x0 and rs2 is equal to x0:* Invalidate all TLB +entries for all address spaces. This is referred to as the "SFENCE.VMA +x0 x0" case. + +image:sfence_x0_x0.png[*Figure 8:* Invalidate all TLB entries +if both source register specifiers are x0,scaledwidth=62.0%] + +* *If rs1 is not equal to x0 and rs2 is equal to x0:* Invalidate all TLB +entries that contain leaf page table entries corresponding to the +virtual address in rs1, for all address spaces. This is referred to as +the “SFENCE.VMA vaddr x0” case. + +image:sfence_vaddr_x0.png[*Figure 9:* Invalidate TLB entry +with matching virtual address for all address spaces,scaledwidth=75.0%] + +* *If rs1 is equal to x0 and rs2 is not equal to x0:* Invalidate all TLB +entries matching the address space identified by integer register rs2, +except for entries containing global mappings. This is referred to as +the “SFENCE.VMA x0 asid” case. + +image:sfence_x0_asid.png[*Figure 10:* Invalidate TLB entry for +matching ASIDs,scaledwidth=75.0%] + +When a TLB valid update request is signaled by the shared TLB, and the +replacement policy selects the update of a specific TLB entry, the +corresponding entry's tag is updated with the new tag, and its +associated content is refreshed with the information from the update +request.
This ensures that the TLB entry accurately reflects the new +translation information. + +Cache replacement algorithms are used to determine which TLB entry +should be replaced, because it is not likely to be used in the near +future. The Pseudo-Least-Recently-Used (PLRU) is a cache entry +replacement algorithm, derived from Least-Recently-Used (LRU) cache +entry replacement algorithm, used by the TLB. Instead of precisely +tracking recent usage as the LRU algorithm does, PLRU employs an +approximate measure to determine which entry in the cache has not been +recently used and as such can be replaced. + +CVA6 implements the PLRU algorithm via the Tree-PLRU method which +implements a binary tree. The TLB entries are the leaf nodes of the +tree. Each internal node, of the tree, consists of a single bit, +referred to as the state bit or plru bit, indicating which subtree +contains the (pseudo) least recently used entry (the PLRU); 0 for the +left hand tree and 1 for the right hand tree. Following this traversal, +the leaf node reached, corresponds to the PLRU entry which can be +replaced. Having accessed an entry (so as to replace it) we need to +promote that entry to be the Most Recently Used (MRU) entry. This is +done by updating the value of each node along the access path to point +away from that entry. If the accessed entry is a right child i.e., its +parent node value is 1, it is set to 0, and if the parent is the left +child of its parent (the grandparent of the accessed node) then its node +value is set to 1 and so on all the way up to the root node. + +The PLRU binary tree is implemented as an array of node values. Nodes +are organized in the array based on levels, with those from lower levels +appearing before higher ones. Furthermore those on the left side of a +node appear before those on the right side of a node. The figure below +shows a tree and the corresponding array. 
+ +image:plru_tree_indexing.png[*Figure 11:* PLRU Tree +Indexing,scaledwidth=60.0%] + +For n-way associative, we require n - 1 internal nodes in the tree. With +those nodes, two operations need to be performed efficiently. + +* Promote the accessed entry to be MRU +* Identify which entry to replace (i.e. the PLRU entry) + +For a TLB entry which is accessed, the following steps are taken to make +it the MRU: + +1. Iterate through each level of the binary tree. +2. Calculate the index of the leftmost child within the current level. +Let us call that index the index base. +3. Calculate the shift amount to identify the relevant node based on +the level and TLB entry index. +4. Calculate the new value that the node should have in order to make +the accessed entry the Most Recently Used (MRU). The new value of the +root node is the opposite of the TLB entry index, MSB at the root node, +MSB - 1 at node at next level and so on. +5. Assign this new value to the relevant node, ensuring that the hit +entry becomes the MRU within the binary tree structure. + +At level 0, no bit of the TLB entry’s index determines the offset from +the index base because it’s a root node. At level 1, MSB of entry’s +index determines the amount of offset from index base at that level. At +level 2, the first two bits of the entry's index from MSB side determine +the offset from the index base because there are 4 nodes at the level 2 +and so on. + +image:update_tree.png[*Figure 12:* Promote Entry to be +MRU,scaledwidth=82.0%] + +In the above figure entry at index 5, is accessed. To make it MRU entry, +every node along the access path should point away from it. Entry 5 is a +right child, therefore, its parent plru bit set to 0, its parent is a +left child, its grand parent’s plru bit set to 1, and great +grandparent’s plru bit set to 0. + +Every TLB entry is checked for the replacement entry. The following +steps are taken: + +1. Iterate through each level of the binary tree. +2. 
Calculate the index of the leftmost child within the current level. +Let us call that index the index base. +3. Calculate the shift amount to identify the relevant node based on +the level and TLB entry index. +4. If the corresponding bit of the entry's index matches the value of +the node being traversed at the current level, keep the replacement +signal high for that entry; otherwise, set the replacement signal to +low. + +image:replacement_entry.png[*Figure 13:* Possible path +traverse for entry selection for replacement,scaledwidth=65.0%] + +Figure shows every possible path that traverses to find out the PLRU +entry. If the plru bit at each level matches with the corresponding bit +of the entry's index, that’s the next entry to replace. Below Table +shows the entry selection for replacement. + +[width="81%",cols="35%,27%,38%",] +|================================================ +|*Path Traverse* |*PLRU Bits* |*Entry to replace* +a| +0 -> 1 -> 3:: + * + + a| +___ +000 +___ + +---------------+:: + 001 + + a| +_ +0 +_ + +----------------------+:: + 1 + +a| +0 -> 1 -> 4:: + * + + a| +___ +010 +___ + +---------------+:: + 011 + + a| +_ +2 +_ + +----------------------+:: + 3 + +a| +0 -> 2 -> 5:: + * + + a| +___ +100 +___ + +---------------+:: + 101 + + a| +_ +4 +_ + +----------------------+:: + 5 + +a| +0 -> 2 -> 6:: + * + + a| +___ +110 +___ + +---------------+:: + 111 + + a| +_ +6 +_ + +----------------------+:: + 7 + +|================================================ + +[[shared-translation-lookaside-buffer]] +Shared Translation Lookaside Buffer +----------------------------------- + +The CVA6 shared TLB is structured as a 2-way associative cache, where +the virtual address requiring translation is compared with the set +indicated by the virtual page number. The shared TLB is looked up in +case of an Instruction TLB (ITLB) or data TLB (DTLB) miss, signaled by +these TLBs. 
If the entry is found in the shared TLB set, the respective +TLB, whose translation is being requested, is updated. If the entry is +not found in the shared TLB, then the processor has to perform a page +table walk. Once the processor obtains a PPN corresponding to the VPN, +the shared TLB is updated with this information. If the physical page is +not found in the page table, it results in a page fault, which is +handled by the operating system. The operating system will then place +the corresponding physical page in memory. + +The inputs and output signals of the shared TLB are shown in the +following two figures. + +image:shared_tlb_in_out.png[*Figure 14:* Inputs and outputs of +CVA6 shared TLB,scaledwidth=60.0%] + +[cols=",,,,",options="header",] +|======================================================================= +|Signal |IO |Connection |Type |Description +|`clk_i` |in |Subsystem |logic |Subsystem Clock + +|`rst_ni` |in |Subsystem |logic |Asynchronous reset active low + +|`flush_i` |in |Controller |logic |TLB flush request + +|`enable_translation_i` |in |CSR Regfile |logic |CSRs indicate to enable +Sv32 + +|`en_ld_st_translation_i` |in |CSR Regfile |logic |Enable virtual memory +translation for load/stores + +|`asid_i` |in |CSR Regfile |logic |ASID for the lookup + +|`itlb_access_i` |in |Cache Subsystem |logic |Signal indicating a lookup +access in ITLB is being requested. + +|`itlb_hit_i` |in |ITLB |logic |Signal indicating an ITLB hit + +|`itlb_vaddr_i` |in |Cache Subsystem |logic[31:0] |Virtual address +lookup in ITLB + +|`dtlb_access_i` |in |Load/Store Unit |logic |Signal indicating a lookup +access in DTLB is being requested. 
+ +|`dtlb_hit_i` |in |DTLB |logic |Signal indicating a DTLB hit + +|`dtlb_vaddr_i` |in |Load/Store Unit |logic[31:0] |Virtual address +lookup in DTLB + +|`itlb_update_o` |out |ITLB |tlb_update_sv32_t |Tag and content to +update ITLB + +|`dtlb_update_o` |out |DTLB |tlb_update_sv32_t |Tag and content to +update DTLB + +|`itlb_miss_o` |out |Performance Counter |logic |Signal indicating an +ITLB miss + +|`dtlb_miss_o` |out |Performance Counter |logic |Signal indicating a +DTLB miss + +|`shared_tlb_access_o` |out |PTW |logic |Signal indicating a lookup +access in shared TLB is being requested + +|`shared_tlb_hit_o` |out |PTW |logic |Signal indicating a shared TLB hit + +|`shared_tlb_vaddr_o` |out |PTW |logic[31:0] |Virtual address lookup in +shared TLB + +|`itlb_req_o` |out |PTW |logic |ITLB Request Output + +|`shared_tlb_update_i` |in |PTW |tlb_update_sv32_t |Updated tag and +content of shared TLB +|======================================================================= + +[cols=",,",options="header",] +|======================================================================= +|Signal |Type |Description +|`is_4M` |logic |Indicates if the shared TLB entry corresponds to a 4MB +page. + +|`vpn1` |logic[9:0] |Virtual Page Number (VPN) represents the index of +PTE in the page table level 1. + +|`vpn0` |logic[9:0] |Virtual Page Number (VPN) represents the index of +PTE in the page table level 0. + +|`asid` |logic |Address Space Identifier (ASID) used to identify +different address spaces +|======================================================================= + +The shared TLB is 2-way associative, with a depth of 64. A single entry in +the set contains the valid bit, the tag and the content. The Tag segment +stores details such as the virtual page number (VPN1, VPN0), ASID, and +page size (is_4M). The Content field contains two physical page numbers +(PPN1, PPN0) along with a number of bits which specify various +attributes of the physical page.
+ +image:shared_tlb.png[*Figure 15:* CVA6 Shared TLB +Structure,scaledwidth=60.0%] + +The implementation of a shared TLB in CVA6 is described in the following +sections: + +* *ITLB and DTLB Miss:* Prepare a shared TLB lookup if the entry is not +found in ITLB or DTLB. +* *Tag Comparison:* Look up the provided virtual address in the shared +TLB. +* *Update and Flush:* Flush the shared TLB or update it. +* *Replacement Policies:* First non-valid entry and random replacement +policy. + +Consider a scenario where an entry is found in the ITLB or DTLB. In this +case, there is no need to perform a lookup in the shared TLB since the +entry has already been found. Next, there are two scenarios: an ITLB +miss or a DTLB miss. + +To identify an ITLB miss, the following conditions need to be fulfilled: + +* Address translation must be enabled. +* There must be an access request to the ITLB. +* The ITLB should indicate an ITLB miss. +* There should be no access request to the DTLB. + +During an ITLB miss, access is granted to read the tag and content of +the shared TLB from their respective sram. The address for reading the +tag and content of the shared TLB entry is calculated using the virtual +address for which translation is not found in the ITLB. The ITLB miss is +also explicitly indicated by the shared TLB. A request for shared TLB +access is initiated. + +To identify the DTLB miss, the following conditions need to be +fulfilled: + +* Address translation for load and stores must be enabled. +* There must be an access request to the DTLB. +* The DTLB should indicate a DTLB miss. + +In the case of a DTLB miss, the same logic is employed as described for +an ITLB miss. + +Shared TLB lookup for a hit occurs under the same conditions as +described for the TLB modules used as ITLB and DTLB. However, there are +some distinctions. In both the ITLB and DTLB, the virtual address +requiring translation is compared against all TLB entries. 
In contrast, +the shared TLB only compares the tag and content of the set indicated by +the provided virtual page number. The index of the set is extracted from +VPN0 of the requested virtual address. Given that the shared TLB is +2-way associative, each set contains two entries. Consequently, both of +these entries are compared. The figure below illustrates how the set is +selected for the lookup. + +image:shared_tlb_set.png[*Figure 16:* Set opted for lookup in +shared TLB,scaledwidth=60.0%] + +Differing from the ITLB and DTLB, a specific virtual address or +address space cannot be flushed in the shared TLB. When SFENCE.VMA is +committed, all entries in the shared TLB are invalidated. (Cases of +SFENCE.VMA should also be added in shared TLB) + +When the Page Table Walker signals a valid update request, the shared +TLB is updated by selecting an entry through the replacement policy and +marking it as valid. This also triggers the writing of the new tag and +content to the respective SRAM. + +In CVA6's shared TLB, two replacement policies are employed for +replacements based on a specific condition. These replacement policies +select the entry within the set indicated by the virtual page number. +The two policies are: + +* First non-valid encounter replacement policy +* Random replacement policy + +The first replacement policy fails if all ways are valid. In that case, the +random replacement policy is used instead. + +The module implemented in CVA6 to find the first non-valid entry in the +shared TLB is the Leading Zero Counter (LZC). It takes three parameters +as input: + +1. *WIDTH:* The width of the input vector. +2. *MODE:* Mode selection - 0 for trailing zero, 1 for leading zero. +3. *CNT WIDTH:* Width of the output signal containing the zero count. + +The input signal is the vector to be counted, and the output represents +the count of trailing/leading zeros. If all bits in the input vector are +zero, it will also be indicated.
+ +When initializing the module, the width of the input vector is set to +the number of shared TLB ways. The trailing zero counter mode is +selected. The vector of valid bits is set as the input vector, but with +negation. This is because we want the index of the first non-valid +entry, and LZC returns the count of trailing zeros, which actually +corresponds to the index of the first occurrence of 1 from the least +significant bit (LSB). If there is at least one non-valid entry, that +entry is selected for replacement; if not, this is signaled by the +LZC. + +image:LZC.png[*Figure 17:* Replacement of First invalid +entry.,scaledwidth=60.0%] + +If all ways are valid, a random replacement policy is employed for the +replacement process. The Linear Feedback Shift Register (LFSR) is +utilized to select the replacement entry randomly. LFSR is commonly used +in generating sequences of pseudo-random numbers. When the enable signal +is active, the current state of the LFSR undergoes a transformation. +Specifically, the state is shifted right by one bit, and the result is +combined with a predetermined masking pattern. This masking pattern is +derived from the predefined “Masks” array, introducing a non-linear +behavior to the sequence generation of the LFSR. The masking process +involves XOR operations between the shifted state bits and specific +pattern bits, contributing to the complexity and unpredictability of the +generated sequence. + +image:RR.png[*Figure 18:* Entry selection for replacement +using LFSR,scaledwidth=95.0%] + +[[page-table-walker]] +Page Table Walker +----------------- + +The "CVA6 Page Table Walker (PTW) for MMU Sv32" is a hardware module +developed for the CVA6 processor architecture, designed to facilitate +the translation of virtual addresses into physical addresses, a crucial +task in memory access management.
+ +image:ptw_in_out.png[*Figure 19:* Input and Outputs of Page +Table Walker,scaledwidth=60.0%] + +The PTW module operates through various states, each with its specific +function, such as handling memory access requests, validating page table +entries, and responding to errors. + +Key features of this PTW module include support for two levels of page +tables (LVL1 and LVL2) in the Sv32 standard, accommodating instruction +and data page table walks. It rigorously validates and verifies page +table entries (PTEs) to ensure translation accuracy and adherence to +access permissions. This module seamlessly integrates with the CVA6 +processor's memory management unit (MMU), which governs memory access +control. It also takes into account global mapping, access flags, and +privilege levels during the translation process, ensuring that memory +access adheres to the processor's security and privilege settings. + +In addition to its translation capabilities, the PTW module is equipped +to detect and manage errors, including page-fault exceptions and access +exceptions, contributing to the robustness of the memory access system. +It works harmoniously with physical memory protection (PMP) +configurations, a critical aspect of modern processors' memory security. +Moreover, the module efficiently processes virtual addresses, generating +corresponding physical addresses, all while maintaining speculative +translation, a feature essential for preserving processor performance +during memory access operations. 
+ +[cols=",,,,",options="header",] +|======================================================================= +|Signal |IO |Connection |Type |Description +|`clk_i` |in |Subsystem |logic |Subsystem Clock + +|`rst_ni` |in |Subsystem |logic |Asynchronous reset active low + +|`flush_i` |in |Controller |logic |Sfence Committed + +|`ptw_active_o` |out |MMU |logic |Output signal indicating whether the +Page Table Walker (PTW) is currently active + +|`walking_instr_o` |out |MMU |logic |Indicating it's an instruction page +table walk or not + +|`ptw_error_o` |out |MMU |logic |Output signal indicating that an error +occurred during PTW operation + +|`ptw_access_exception_o` |out |MMU |logic |Output signal indicating +that a PMP (Physical Memory Protection) access exception occurred during +PTW operation. + +|`lsu_is_store_i` |in |Store Unit |logic |Input signal indicating +whether the translation was triggered by a store operation. + +|`req_port_i` |in |Cache Subsystem |dcache_req_o_t |D Cache Data +Requests + +|`req_port_o` |out |Cache Subsystem / Perf Counter |dcache_req_u_t |D +Cache Data Response + +|`shared_tlb_update_o` |out |Shared TLB |tlb_update_sv32_t |Updated tag +and content of shared TLB + +|`update_vaddr_o` |out |MMU |logic[riscv::VLEN-1:0] |Updated VADDR from +shared TLB + +|`asid_i` |in |CSR RegFile |logic[ASID_WIDTH-1:0] |ASID for the lookup + +|`shared_tlb_access_i` |in |Shared TLB |logic |Access request of shared +TLB + +|`shared_tlb_hit_i` |in |Shared TLB |logic |Indicate shared TLB hit + +|`shared_tlb_vaddr_i` |in |Shared TLB |logic[riscv::VLEN-1:0] |Virtual +Address from shared TLB + +|`itlb_req_i` |in |Shared TLB |logic |Indicate request to ITLB + +|`satp_ppn_i` |in |CSR RegFile |logic[riscv::PPNW-1:0] |PPN of top level +page table from SATP register + +|`mxr_i` |in |CSR RegFile |logic |Make Executable Readable bit in +xSTATUS CSR register + +|`shared_tlb_miss_o` |out |OPEN |logic |Indicate a shared TLB miss + +|`pmpcfg_i` |in |CSR RegFile 
|riscv::pmpcfg_t[15:0] |PMP configuration + +|`pmpaddr_i` |in |CSR RegFile |logic[15:0][riscv::PLEN-3:0] |PMP Address + +|`bad_paddr_o` |out |MMU |logic[riscv::PLEN-1:0] |Bad Physical Address +in case of access exception +|======================================================================= + +[cols=",,",options="header",] +|======================================================================= +|Signal |Type |Description +|`address_index` |logic [DCACHE_INDEX_WIDTH-1:0] |Index of the Dcache +Line + +|`address_tag` |logic [DCACHE_TAG_WIDTH-1:0] |Tag of the Dcache Line + +|`data_wdata` |riscv::xlen_t |Data to write in the Dcache + +|`data_wuser` |logic [DCACHE_USER_WIDTH-1:0] |data_wuser + +|`data_req` |logic |Data Request + +|`data_we` |logic |Data Write enabled + +|`data_be` |logic [(riscv::XLEN/8)-1:0] |Data Byte enable + +|`data_size` |logic [1:0] |Size of data + +|`data_id` |logic [DCACHE_TID_WIDTH-1:0] |Data ID + +|`kill_req` |logic |Kill the D cache request + +|`tag_valid` |logic |Indicate that the tag is valid +|======================================================================= + +[cols=",,",options="header",] +|======================================================================= +|Signal |Type |Description +|`data_gnt` |logic |Grant of data is given in response to the data +request + +|`data_rvalid` |logic |Indicate that data is valid which is sent by D +cache + +|`data_rid` |logic [DCACHE_TID_WIDTH-1:0] |Requested data ID + +|`data_rdata` |riscv::xlen_t |Data from D cache + +|`data_ruser` |logic [DCACHE_USER_WIDTH-1:0] |Requested data user +|======================================================================= + +The Page Table Walker is implemented as a finite state machine. It listens +to the shared TLB for incoming translation requests. If there is a shared +TLB miss, it saves the virtual address and starts the page table walk. +The page table walker transitions between 7 states in CVA6.
+ +* *IDLE:* The initial state where the PTW is awaiting a trigger, often a +Shared TLB miss, to initiate a memory access request. +* *WAIT_GRANT:* Request memory access and wait for data grant +* *PTE_LOOKUP:* Once granted access, the PTW examines the valid Page +Table Entry (PTE), checking attributes to determine the appropriate +course of action. +* *PROPAGATE_ERROR:* If the PTE is invalid, this state handles the +propagation of an error, often leading to a page-fault exception due to +non-compliance with access conditions +* *PROPAGATE_ACCESS_ERROR:* Propagate access fault if access is not +allowed from a PMP perspective +* *WAIT_RVALID:* After processing a PTE, the PTW waits for a valid data +signal, indicating that relevant data is ready for further processing. +* *LATENCY:* Introduces a delay to account for synchronization or timing +requirements between states. + +image:ptw_state_diagram.png[*Figure 20:* State Machine Diagram +of CVA6 PTW,scaledwidth=95.0%] + +In the IDLE state of the Page Table Walker (PTW) finite state machine, +the system awaits a trigger to initiate the page table walk process. +This trigger is often prompted by a Shared Translation Lookaside Buffer +(TLB) miss, indicating that the required translation is not present in +the shared TLB cache. The PTW's behavior in this state is explained as +follows: + +1. The top-most page table is selected for the page table walk. In the +case of SV32, which implements a two-level page table, the level 1 page +table is chosen. +2. In the IDLE state, translations are assumed to be invalid in all +address spaces. +3. The signal indicating the instruction page table walk is set to 0. +4. A conditional check is performed: if there is a shared TLB access +request and the entry is not found in the shared TLB (indicating a +shared TLB miss), the following steps are executed: +a.
The address of the desired Page Table Entry within the level 1 page +table is calculated by multiplying the Physical Page Number (PPN) of the +level 1 page table from the SATP register by the page size (4kB). This +result is then added to the product of the Virtual Page Number (VPN1), +and the size of a page table entry(4 bytes). + +image:ptw_idle.png[*Figure 21:* Address of Desired PTE at +Level 1,scaledwidth=68.0%] + +In the *WAIT_GRANT* state of the Page Table Walker's finite state +machine, a data request is sent to retrieve memory information. It waits +for a data grant signal from the Dcache controller, remaining in this +state until granted. Once granted, it activates a tag valid signal, +marking data validity. The state then transitions to "PTE_LOOKUP" for +page table entry lookup. + +In the *PTE_LOOKUP* state of the Page Table Walker (PTW) finite state +machine, the PTW performs the actual lookup and evaluation of the page +table entry (PTE) based on the virtual address translation. The behavior +and operations performed in this state are detailed as follows: + +1. The state waits for a valid signal indicating that the data from the +memory subsystem, specifically the page table entry, is available for +processing. +2. Upon receiving the valid signal, the PTW proceeds with examining the +retrieved page table entry to determine its properties and validity. +3. The state checks if the global mapping bit in the PTE is set, and if +so, sets the global mapping signal to indicate that the translation +applies globally across all address spaces. +4. The state distinguishes between two cases: Invalid PTE and Valid +PTE. +a. If the valid bit of the PTE is not set, or if the PTE has reserved +RWX field encodings, it signifies an Invalid PTE. In such cases, the +state transitions to the "PROPAGATE_ERROR" state, indicating a +page-fault exception due to an invalid translation. 
+ +image:ptw_pte_1.png[*Figure 22:* Invalid PTE and reserved RWX +encoding leads to page fault,scaledwidth=70.0%] + +1. Within the Valid PTE scenario, the state performs further checks +based on whether the translation is intended for instruction fetching or +data access: +a. For instruction page table walk, if the page is not executable +(pte.x is not set) or not marked as accessed (pte.a is not set), the +state transitions to the "PROPAGATE_ERROR" state. + +image:ptw_iptw.png[*Figure 23:* For Instruction Page Table +Walk,scaledwidth=70.0%] + +image:ptw_dptw.png[*Figure 24:* Data Access Page Table +Walk,scaledwidth=70.0%] + +image:ptw_dptw_s.png[*Figure 25:* Data Access Page Table Walk, +Store requested,scaledwidth=70.0%] + +1. The state also checks for potential misalignment issues in the +translation: If the current page table level is the first level (LVL1) +and if the PPN0 of the PTE is not zero, it indicates a misaligned +superpage, leading to a transition to the "PROPAGATE_ERROR" state. + +image:ptw_mis_sup.png[*Figure 26:* Misaligned Superpage +Check,scaledwidth=70.0%] + +1. If the PTE is valid but the page is neither readable nor executable, +the PTW recognizes the PTE as a pointer to the next level of the page +table, indicating that additional translation information can be found +in the referenced page table at a lower level. +2. If the current page table level is the first level (LVL1), the PTW +proceeds to switch to the second level (LVL2) page table, updating the +next level pointer and calculating the address for the next page table +entry using the Physical Page Number from the PTE and the index of the +level 2 page table from the virtual address. + +image:ptw_nlvl.png[*Figure 27:* Address of desired PTE at next +level of Page Table,scaledwidth=70.0%] + +1. The state then transitions to the "WAIT_GRANT" state, indicating +that the PTW is awaiting the grant signal to proceed with requesting the +next level page table entry. +2.
If the current level is already the second level (LVL2), an error is +flagged, and the state transitions to the "PROPAGATE_ERROR" state, +signifying an unexpected situation where the PTW is already at the last +level page table. +3. If the translation access is found to be restricted by the Physical +Memory Protection (PMP) settings (allow_access is false), the state +updates the shared TLB update signal to indicate that the TLB entry +should not be updated. Additionally, the saved address for the page +table walk is restored to its previous value, and the state transitions +to the "PROPAGATE_ACCESS_ERROR" state. +4. Lastly, if the data request for the page table entry was granted, +the state indicates to the cache subsystem that the tag associated with +the data is now valid. + +image:ptw_pte_flowchart.png[*Figure 28:* Flow Chart of PTE +LOOKUP State] + +This state indicates a detected error in the page table walk process, +and an error signal is asserted to indicate the Page Table Walker's +error condition, triggering a transition to the "LATENCY" state for +error signal propagation. + +This state indicates a detected access error in the page table walk +process, and an access error signal is asserted to indicate the Page +Table Walker's access error condition, triggering a transition to the +"LATENCY" state for access error signal propagation. + +This state waits until it gets the "read valid" signal, and when it +does, it's ready to start a new page table walk. + +The LATENCY state introduces a latency period to allow for necessary +system actions or signals to stabilize. After the latency period, the +FSM transitions back to the IDLE state, indicating that the system is +prepared for a new translation request. + +The first step when a flush is triggered is to check whether the Page +Table Entry (PTE) lookup process is currently in progress. 
If the PTW +(Page Table Walker) module is indeed in the middle of a PTE lookup +operation, the code then proceeds to evaluate a specific aspect of this +operation. + +* *Check for Data Validity (rvalid):* Within the PTE lookup operation, +it's important to ensure that the data being used for the translation is +valid. In other words, the code checks whether the "rvalid" signal +(which likely indicates the validity of the data) is not active. If the +data is not yet valid, it implies that the PTW module is waiting for the +data to become valid before completing the lookup. In such a case, the +code takes appropriate action to wait for the data to become valid +before proceeding further. +* *Check for Waiting on Grant:* The second condition the code checks for +during a flush scenario is whether the PTW module is currently waiting +for a "grant." This "grant" signal is typically used to indicate +permission or authorization to proceed with an operation. If the PTW +module is indeed in a state of waiting for this grant signal, it implies +that it requires authorization before continuing its task. ++ +__________________________________________________________________________________________________________________________________________________________________________________________________ +** *Waiting for Grant:* If the PTW module is in a state of waiting for +the grant signal, the code ensures that it continues to wait for the +grant signal to be asserted before proceeding further. +__________________________________________________________________________________________________________________________________________________________________________________________________ +* *Return to Idle State if Neither Condition is Met:* After evaluating +the above two conditions, the code determines whether either of these +conditions is true. 
If neither of these conditions applies, it suggests +that the PTW module can return to its idle state, indicating that it can +continue normal operations without any dependencies on the flush +condition. diff --git a/docs/04_cv32a65x/design/source/subsystem.rst b/docs/design/design-manual/source/subsystem.adoc similarity index 57% rename from docs/04_cv32a65x/design/source/subsystem.rst rename to docs/design/design-manual/source/subsystem.adoc index 4f507d71bc..a28862511e 100644 --- a/docs/04_cv32a65x/design/source/subsystem.rst +++ b/docs/design/design-manual/source/subsystem.adoc @@ -1,4 +1,4 @@ -.. +//// Copyright 2022 Thales DIS design services SAS Licensed under the Solderpad Hardware Licence, Version 2.0 (the "License"); you may not use this file except in compliance with the License. @@ -6,25 +6,30 @@ You may obtain a copy of the License at https://solderpad.org/licenses/ Original Author: Jean-Roch COULON - Thales +//// - - +[[subsystem]] Subsystem -========= +--------- +[[global-functionality]] Global functionality --------------------- +~~~~~~~~~~~~~~~~~~~~ -The CVA6 is a subsystem composed of the modules and protocol interfaces as illustrated -The processor is a Harvard-based modern architecture. -Instructions are issued in-order through the DECODE stage and executed out-of-order but committed in-order. -The processor is Single issue, that means that at maximum one instruction per cycle can be issued to the EXECUTE stage. +The CVA6 is a subsystem composed of the modules and protocol interfaces +as illustrated. The processor is a Harvard-based modern architecture. +Instructions are issued in-order through the DECODE stage and executed +out-of-order but committed in-order. The processor is single-issue, which +means that at most one instruction per cycle can be issued to the +EXECUTE stage. -The CVA6 implements a 6-stage pipeline composed of PC Generation, Instruction Fetch, Instruction Decode, Issue stage, Execute stage and Commit stage. 
-At least 6 cycles are needed to execute one instruction. +The CVA6 implements a 6-stage pipeline composed of PC Generation, +Instruction Fetch, Instruction Decode, Issue stage, Execute stage and +Commit stage. At least 6 cycles are needed to execute one instruction. +[[connection-with-other-sub-systems]] Connection with other sub-systems ---------------------------------- +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ The submodule is connected to : @@ -33,16 +38,14 @@ The submodule is connected to : * TRACER provides support for verification * TRAP provides traps inputs - +[[parameter-configuration]] Parameter configuration ------------------------ - - -.. include:: parameters_cv32a65x.rst - +~~~~~~~~~~~~~~~~~~~~~~~ +include::parameters.adoc[] +[[io-ports]] IO ports --------- +~~~~~~~~ -.. include:: port_cva6.rst +include::port_cva6.adoc[] diff --git a/docs/04_cv32a65x/design/source/CVXIF.rst b/docs/design/design-manual/source/traps.adoc similarity index 84% rename from docs/04_cv32a65x/design/source/CVXIF.rst rename to docs/design/design-manual/source/traps.adoc index 94dc7e4f16..fd6f42ef43 100644 --- a/docs/04_cv32a65x/design/source/CVXIF.rst +++ b/docs/design/design-manual/source/traps.adoc @@ -1,4 +1,4 @@ -.. +//// Copyright 2023 Thales DIS France SAS Licensed under the Solderpad Hardware Licence, Version 2.0 (the "License"); you may not use this file except in compliance with the License. @@ -6,6 +6,10 @@ You may obtain a copy of the License at https://solderpad.org/licenses/ Original Author: Jean-Roch COULON - Thales +//// + +[[traps]] + +include::Traps_Interrupts_Exceptions.adoc[] -.. 
include:: ../../../01_cva6_user/CVX_Interface_Coprocessor.rst diff --git a/docs/index.rst b/docs/index.rst index c8ba58bc72..ce38770a99 100644 --- a/docs/index.rst +++ b/docs/index.rst @@ -56,7 +56,7 @@ The target audience of this document is current and existing members of the Open The :doc:`CVA6 Design Document <03_cva6_design/index>` describes in detail the **CVA6**, the code base that can be used to compile/synthesize a specific core instance (e.g. cv32a65x). -The :doc:`CV32A65X Design Document <04_cv32a65x/design/source/index>` describes in detail the **CV32A65X**, a specific core based on the CVA6 and the first production quality 32-bit application processor derived from the CVA6. +The :doc:`CV32A65X Design Document <04_cv32a65x/design/design>` describes in detail the **CV32A65X**, a specific core based on the CVA6 and the first production quality 32-bit application processor derived from the CVA6. The primary audience for this documentation are design and verification engineers working to bring the CV32A65X to TRL-5. The :doc:`CVA6 APU <05_cva6_apu/index>` describes an Application Processor Unit built around the CVA6. @@ -69,5 +69,5 @@ The :doc:`CVA6 APU <05_cva6_apu/index>` describes an Application Processor Unit 01_cva6_user/index.rst 03_cva6_design/index.rst 04_cv32a65x/index.rst + 06_cv64a6_mmu/index.rst 05_cva6_apu/index.rst - diff --git a/docs/make.bat b/docs/make.bat deleted file mode 100644 index 2119f51099..0000000000 --- a/docs/make.bat +++ /dev/null @@ -1,35 +0,0 @@ -@ECHO OFF - -pushd %~dp0 - -REM Command file for Sphinx documentation - -if "%SPHINXBUILD%" == "" ( - set SPHINXBUILD=sphinx-build -) -set SOURCEDIR=. -set BUILDDIR=_build - -if "%1" == "" goto help - -%SPHINXBUILD% >NUL 2>NUL -if errorlevel 9009 ( - echo. - echo.The 'sphinx-build' command was not found. Make sure you have Sphinx - echo.installed, then set the SPHINXBUILD environment variable to point - echo.to the full path of the 'sphinx-build' executable. 
Alternatively you - echo.may add the Sphinx directory to PATH. - echo. - echo.If you don't have Sphinx installed, grab it from - echo.http://sphinx-doc.org/ - exit /b 1 -) - -%SPHINXBUILD% -M %1 %SOURCEDIR% %BUILDDIR% %SPHINXOPTS% %O% -goto end - -:help -%SPHINXBUILD% -M help %SOURCEDIR% %BUILDDIR% %SPHINXOPTS% %O% - -:end -popd diff --git a/docs/riscv-isa/build.mk b/docs/riscv-isa/build.mk index 5828ee4271..27753d16b5 100644 --- a/docs/riscv-isa/build.mk +++ b/docs/riscv-isa/build.mk @@ -10,6 +10,8 @@ ifeq ($(CONFIG),) $(error CONFIG must be defined) endif +current_dir = $(shell pwd) + # Path of current file, intended to be included by a configuration subfolder riscv-isa_dir := $(dir $(lastword $(MAKEFILE_LIST))) @@ -17,9 +19,12 @@ all: priv-pdf priv-html unpriv-pdf unpriv-html setup: mkdir -p build/riscv-isa-manual + cp -r $(riscv-isa_dir)/riscv-isa-manual/* build/riscv-isa-manual cp -r $(riscv-isa_dir)/src build/riscv-isa-manual - cp -r src build/riscv-isa-manual + cp $(riscv-isa_dir)/../common/*.adoc build/riscv-isa-manual/src + + cd ../.. && python3 scripts/spec_builder.py --target $(CONFIG) --gen-config $(current_dir)/build/riscv-isa-manual/src/config.adoc priv-pdf: setup cd build/riscv-isa-manual; make SKIP_DOCKER=true build/riscv-privileged.pdf diff --git a/docs/riscv-isa/riscv-isa-manual b/docs/riscv-isa/riscv-isa-manual index ebf2e3a0b4..2c07aa2bcc 160000 --- a/docs/riscv-isa/riscv-isa-manual +++ b/docs/riscv-isa/riscv-isa-manual @@ -1 +1 @@ -Subproject commit ebf2e3a0b402cd56fd4b571b705b31f3be62c2cc +Subproject commit 2c07aa2bcc02fd5fb2e53e42a32dc62a3eb0aa62 diff --git a/docs/riscv-isa/src/colophon.adoc b/docs/riscv-isa/src/colophon.adoc index d0f62d0784..d05ee78be3 100644 --- a/docs/riscv-isa/src/colophon.adoc +++ b/docs/riscv-isa/src/colophon.adoc @@ -7,7 +7,7 @@ This document describes the RISC-V unprivileged architecture tailored for OpenHW Group {ohg-config}. 
-[.big]*_Preface to Document Version 20240703_* +[.big]*_Preface to Document Version 20241017_* This document describes the RISC-V unprivileged architecture. @@ -40,8 +40,8 @@ h|Extension h|Version h|Status |*Zmmul* |*1.0* |*Ratified* |*A* |*2.1* |*Ratified* |*Zawrs* |*1.01* |*Ratified* -|*Zacas* |*1.0* |*Ratifed* -|*Zabha* |*1.0* |*Ratifed* +|*Zacas* |*1.0* |*Ratified* +|*Zabha* |*1.0* |*Ratified* |*RVWMO* |*2.0* |*Ratified* |*Ztso* |*1.0* |*Ratified* |*CMO* |*1.0* |*Ratified* diff --git a/docs/riscv-isa/src/counters.adoc b/docs/riscv-isa/src/counters.adoc index ed1342a1d5..ec0676006e 100644 --- a/docs/riscv-isa/src/counters.adoc +++ b/docs/riscv-isa/src/counters.adoc @@ -14,7 +14,7 @@ counters (CYCLE, TIME, and INSTRET), which have dedicated functions (cycle count, real-time clock, and instructions retired, respectively). The Zicntr extension depends on the Zicsr extension. -[TIP] +[NOTE] ==== We recommend provision of these basic counters in implementations as they are essential for basic performance analysis, adaptive and dynamic @@ -27,7 +27,7 @@ Some execution environments might prohibit access to counters, for example, to impede timing side-channel attacks. ==== -include::images/wavedrom/counters-diag.adoc[] +include::images/wavedrom/counters-diag.edn[] For base ISAs with XLEN≥64, CSR instructions can access @@ -35,7 +35,7 @@ the full 64-bit CSRs directly. In particular, the RDCYCLE, RDTIME, and RDINSTRET pseudoinstructions read the full 64 bits of the `cycle`, `time`, and `instret` counters. -[TIP] +[NOTE] ==== The counter pseudoinstructions are mapped to the read-only `csrrs rd, counter, x0` canonical form, but the other read-only CSR @@ -47,7 +47,7 @@ For base ISAs with XLEN=32, the Zicntr extension enables the three RDTIME, and RDINSTRET pseudoinstructions provide the lower 32 bits, and the RDCYCLEH, RDTIMEH, and RDINSTRETH pseudoinstructions provide the upper 32 bits of the respective counters. 
-[TIP] +[NOTE] ==== We required the counters be 64 bits wide, even when XLEN=32, as otherwise it is very difficult for software to determine if values have @@ -67,7 +67,7 @@ overflow in practice. The rate at which the cycle counter advances will depend on the implementation and operating environment. The execution environment should provide a means to determine the current rate (cycles/second) at which the cycle counter is incrementing. -[TIP] +[NOTE] ==== RDCYCLE is intended to return the number of cycles executed by the processor core, not the hart. Precisely defining what is a "core" is @@ -128,7 +128,7 @@ should be constant within a small error bound. The environment should provide a means to determine the accuracy of the clock (i.e., the maximum relative error between the nominal and actual real-time clock periods). -[TIP] +[NOTE] ==== On some simple platforms, cycle count might represent a valid implementation of RDTIME, in which case RDTIME and RDCYCLE may return @@ -141,7 +141,7 @@ bound should be set based on the requirements of the platform. The real-time clocks of all harts must be synchronized to within one tick of the real-time clock. -[TIP] +[NOTE] ==== As with other architectural mandates, it suffices to appear "as if" harts are synchronized to within one tick of the real-time clock, i.e., @@ -154,7 +154,7 @@ hart from some arbitrary start point in the past. RDINSTRETH is only present when XLEN=32 and reads bits 63-32 of the same instruction counter. The underlying 64-bit counter should never overflow in practice. -[TIP] +[NOTE] ==== Instructions that cause synchronous exceptions, including ECALL and EBREAK, are not considered to retire and hence do not increment the @@ -181,7 +181,7 @@ hardware performance counters, `hpmcounter3-hpmcounter31`. When XLEN=32, the upper 32 bits of these performance counters are accessible via additional CSRs `hpmcounter3h- hpmcounter31h`. The Zihpm extension depends on the Zicsr extension. 
-[TIP] +[NOTE] ==== In some applications, it is important to be able to read multiple counters at the same instant in time. When run under a multitasking @@ -203,7 +203,7 @@ exception or may return a constant value. The execution environment should provide a means to determine the number and width of the implemented counters, and an interface to configure the events to be counted by each counter. -[TIP] +[NOTE] ==== For execution environments implemented on RISC-V privileged platforms, the privileged architecture manual describes privileged CSRs controlling diff --git a/docs/riscv-isa/src/machine.adoc b/docs/riscv-isa/src/machine.adoc index 41f8e1468a..651e349645 100644 --- a/docs/riscv-isa/src/machine.adoc +++ b/docs/riscv-isa/src/machine.adoc @@ -300,7 +300,7 @@ endif::[] //image::png/mvendorid.png[align="center"] .Vendor ID register (`mvendorid`) -include::images/bytefield/mvendorid.adoc[] +include::images/bytefield/mvendorid.edn[] ifdef::archi-default[] JEDEC manufacturer IDs are ordinarily encoded as a sequence of one-byte @@ -348,7 +348,7 @@ of the hart supplied to CVA6, 0x3. endif::[] .Machine Architecture ID (`marchid`) register -include::images/bytefield/marchid.adoc[] +include::images/bytefield/marchid.edn[] ifdef::archi-default[] Open-source project architecture IDs are allocated globally by RISC-V @@ -410,7 +410,7 @@ processor itself and not any surrounding system. endif::[] .Machine Implementation ID (`mimpid`) register -include::images/bytefield/mimpid.adoc[] +include::images/bytefield/mimpid.edn[] ifeval::[{note} == true] [NOTE] @@ -443,7 +443,7 @@ Hart ID is zero. 
endif::[] .Hart ID (`mhartid`) register -include::images/bytefield/mhartid.adoc[] +include::images/bytefield/mhartid.edn[] ifeval::[{note} == true] [NOTE] @@ -487,13 +487,13 @@ endif::[] ifdef::archi-default,XLEN-32[] [[mstatusreg-rv32]] .Machine-mode status (`mstatus`) register for RV32 -include::images/wavedrom/mstatusreg-rv321.adoc[] +include::images/wavedrom/mstatusreg-rv321.edn[] endif::[] ifdef::archi-default,XLEN-64[] [[mstatusreg]] .Machine-mode status (`mstatus`) register for RV64 -include::images/wavedrom/mstatusreg.adoc[] +include::images/wavedrom/mstatusreg.edn[] endif::[] ifdef::archi-default[] @@ -509,7 +509,7 @@ endif::[] ifdef::archi-default,XLEN-32[] [[mstatushreg]] .Additional machine-mode status (`mstatush`) register for RV32. -include::images/wavedrom/mstatushreg.adoc[] +include::images/wavedrom/mstatushreg.edn[] endif::[] [[privstack]] @@ -676,9 +676,11 @@ by the same write (For RV32, the `MDT` bit is in `mstatush` and the `MIE` bit in When a trap is to be taken into M-mode, if the `MDT` bit is currently 0, it is then set to 1, and the trap is delivered as expected. However, if `MDT` is -already set to 1, then this is an _unexpected trap_. Additionally, when the -Smrnmi extension is implemented, a trap that occurs when executing in M-mode -with the `mnstatus.NMIE` set to 0 is an _unexpected trap_. +already set to 1, then this is an _unexpected trap_. When the Smrnmi extension +is implemented, a trap caused by an RNMI is not considered an _unexpected trap_ +irrespective of the state of the `MDT` bit. A trap caused by an RNMI does not +set the `MDT` bit. However, a trap that occurs when executing in M-mode with +`mnstatus.NMIE` set to 0 is an _unexpected trap_. In the event of a _unexpected trap_, the handling is as follows: @@ -723,6 +725,9 @@ The `MRET` and `SRET` instructions, when executed in M-mode, set the `MDT` bit to 0. If the new privilege mode is U, VS, or VU, then `sstatus.SDT` is also set to 0. 
Additionally, if it is VU, then `vsstatus.SDT` is also set to 0. +The `MNRET` instruction, provided by the Smrnmi extension, sets the `MDT` bit to +0 if the new privilege mode is not M. If it is U, VS, or VU, then `sstatus.SDT` is +also set to 0. Additionally, if it is VU, then `vsstatus.SDT` is also set to 0. endif::[] ifndef::archi-default,RVZsmdbltrp-true[] @@ -1601,7 +1606,7 @@ and a vector mode (MODE). .Encoding of mtvec MODE field. -include::images/bytefield/mtvec.adoc[] +include::images/bytefield/mtvec.edn[] ifdef::archi-default[] The `mtvec` register must always be implemented, but can contain a @@ -1769,7 +1774,7 @@ endif::[] ifdef::archi-default,RVS-true[] .Machine Exception Delegation (`medeleg`) register. -include::images/bytefield/medeleg.adoc[] +include::images/bytefield/medeleg.edn[] `medeleg` has a bit position allocated for every synchronous exception shown in <>, with the index of the @@ -1789,7 +1794,7 @@ endif::[] ifdef::archi-default,RVS-true[] .Machine Interrupt Delegation (`mideleg`) Register. -include::images/bytefield/mideleg.adoc[] +include::images/bytefield/mideleg.edn[] `mideleg` holds trap delegation bits for individual interrupts, with the layout of bits matching those in the `mip` register (i.e., STIP @@ -1828,10 +1833,10 @@ at the platform's discretion. endif::[] .Machine Interrupt-Pending (`mip`) register. -include::images/bytefield/mideleg.adoc[] +include::images/bytefield/mideleg.edn[] .Machine Interrupt-Enable (`mie`) register -include::images/bytefield/mideleg.adoc[] +include::images/bytefield/mideleg.edn[] ifdef::archi-default[] An interrupt _i_ will trap to M-mode (causing the privilege mode to @@ -1878,11 +1883,11 @@ formatted as shown in <> and <> respectively. [[mipreg-standard]] .Standard portion (bits 15:0) of `mip`. -include::images/bytefield/mipreg-standard.adoc[] +include::images/bytefield/mipreg-standard.edn[] [[miereg-standard]] .Standard portion (bits 15:0) of `mie`. 
-include::images/bytefield/miereg-standard.adoc[] +include::images/bytefield/miereg-standard.edn[] endif::[] ifeval::[{note} == true] @@ -2050,11 +2055,12 @@ formatted as shown in <> and <> respectively. [[mipreg-standard]] .Standard portion (bits 15:0) of `mip`. -include::images/bytefield/mipreg-standard.adoc[] +include::images/bytefield/mipreg-standard.edn[] [[miereg-standard]] .Standard portion (bits 15:0) of `mie`. -include::images/bytefield/miereg-standard.adoc[] +include::images/bytefield/miereg-standard.edn[] +endif::[] [{ohg-config}] Bits `mip`.MEIP and `mie`.MEIE are the interrupt-pending and @@ -2151,7 +2157,7 @@ endif::[] ifdef::archi-default,RVZsmcntrpmf-true[] .Hardware performance monitor counters. -include::images/bytefield/hpmevents.adoc[] +include::images/bytefield/hpmevents.edn[] The `mhpmcounters` are *WARL* registers that support up to 64 bits of precision on RV32 and RV64. @@ -2185,7 +2191,7 @@ these events is defined by the platform, but event 0 is defined to mean selector are read-only 0. .Hardware performance monitor counters. -include::images/bytefield/hpmevents.adoc[] +include::images/bytefield/hpmevents.edn[] endif::[] The `mhpmcounters` are *WARL* registers that support up to 64 bits of @@ -2219,7 +2225,7 @@ counters to the next-lower privileged mode. ifdef::archi-default,RVU-true[] .Counter-enable (`mcounteren`) register. -include::images/bytefield/counteren.adoc[] +include::images/bytefield/counteren.edn[] The settings in this register only control accessibility. The act of reading or writing this register does not affect the underlying @@ -2249,9 +2255,9 @@ ifdef::archi-default[] The `cycle`, `instret`, and `hpmcountern` CSRs are read-only shadows of `mcycle`, `minstret`, and `mhpmcounter n`, respectively. The `time` CSR is a read-only shadow of the memory-mapped `mtime` register. 
-Analogously, on RV32I the `cycleh`, `instreth` and `hpmcounternh` CSRs +Analogously, when XLEN=32, the `cycleh`, `instreth` and `hpmcounternh` CSRs are read-only shadows of `mcycleh`, `minstreth` and `mhpmcounternh`, -respectively. On RV32I the `timeh` CSR is a read-only shadow of the +respectively. When XLEN=32, the `timeh` CSR is a read-only shadow of the upper 32 bits of the memory-mapped `mtime` register, while `time` shadows only the lower 32 bits of `mtime`. endif::[] @@ -2280,7 +2286,7 @@ endif::[] ==== Machine Counter-Inhibit (`mcountinhibit`) Register .Counter-inhibit `mcountinhibit` register -include::images/bytefield/counterinh.adoc[] +include::images/bytefield/counterinh.edn[] ifdef::archi-default,RVZsmcntrpmf-true[] The counter-inhibit register `mcountinhibit` is a 32-bit *WARL* register that @@ -2328,7 +2334,7 @@ machine-mode hart-local context space and swapped with a user register upon entry to an M-mode trap handler. .Machine-mode scratch register. -include::images/bytefield/mscratch.adoc[] +include::images/bytefield/mscratch.edn[] ifeval::[{note} == true] [NOTE] @@ -2389,7 +2395,7 @@ though it may be explicitly written by software. [[mepcreg]] .Machine exception program counter register. -include::images/bytefield/mepcreg.adoc[] +include::images/bytefield/mepcreg.edn[] [[mcause]] ==== Machine Cause (`mcause`) Register @@ -2408,7 +2414,7 @@ the possible machine-level exception codes. The Exception Code is a [[mcausereg]] .Machine Cause (`mcause`) register. -include::images/bytefield/mcausereg.adoc[] +include::images/bytefield/mcausereg.edn[] ifdef::archi-default,RVU-true[] Note that load and load-reserved instructions generate load exceptions, @@ -2534,6 +2540,9 @@ _Designated for platform use_ 0 + 0 + 0 + +0 + +0 + +0 + 0 |0 + 1 + @@ -2717,7 +2726,7 @@ particularly those with hardware page-table walkers. [[mtvalreg]] .Machine Trap Value (`mtval`) register. 
-include::images/bytefield/mtvalreg.adoc[] +include::images/bytefield/mtvalreg.edn[] If `mtval` is written with a nonzero value when a misaligned load or @@ -2810,7 +2819,7 @@ and their configuration. [[mconfigptrreg]] .Machine Configuration Pointer (`mconfigptr`) register. -include::images/bytefield/mconfigptrreg.adoc[] +include::images/bytefield/mconfigptrreg.edn[] The pointer alignment in bits must be no smaller than MXLEN: @@ -2858,7 +2867,7 @@ privileged than M. [[menvcfgreg]] .Machine environment configuration (`menvcfg`) register. -include::images/wavedrom/menvcfgreg.adoc[] +include::images/wavedrom/menvcfgreg.edn[] If bit FIOM (Fence of I/O implies Memory) is set to one in `menvcfg`, @@ -2996,9 +3005,7 @@ ifdef::archi-CVA6+RVU-true[] endif::[] ifdef::archi-default,RVU-true[] -The definition of the PMM field will be furnished by the forthcoming -Smnpm extension. Its allocation within `menvcfg` may change prior to the -ratification of that extension. +The definition of the PMM field is furnished by the Smnpm extension. endif::[] ifdef::archi-CVA6+RVU-true[] @@ -3029,9 +3036,11 @@ the following rules apply to privilege modes that are less than M: * 32-bit Zicfiss instructions will revert to their behavior as defined by Zimop. * 16-bit Zicfiss instructions will revert to their behavior as defined by Zcmop. * The `pte.xwr=010b` encoding in VS/S-stage page tables becomes reserved. -* The `henvcfg.SSE` and `senvcfg.SSE` fields will read as zero and are read-only. * `SSAMOSWAP.W/D` raises an illegal-instruction exception. +When `menvcfg.SSE` is 0, the `henvcfg.SSE` and `senvcfg.SSE` fields are +read-only zero. + The Ssdbltrp extension adds the double-trap-enable (`DTE`) field in `menvcfg`. When `menvcfg.DTE` is zero, the implementation behaves as though Ssdbltrp is not implemented. When Ssdbltrp is not implemented `sstatus.SDT`, `vsstatus.SDT`, and @@ -3069,19 +3078,15 @@ shown in <>, that controls security features. 
[[mseccfg]] .Machine security configuration (`mseccfg`) register. -include::images/wavedrom/mseccfg.adoc[] +include::images/wavedrom/mseccfg.edn[] -The definitions of the SSEED and USEED fields will be furnished by the -forthcoming entropy-source extension, Zkr. Their allocations within -`mseccfg` may change prior to the ratification of that extension. +The definitions of the SSEED and USEED fields are furnished by the +entropy-source extension, Zkr. -The definitions of the RLB, MMWP, and MML fields will be furnished by -the forthcoming PMP-enhancement extension, Smepmp. Their allocations -within `mseccfg` may change prior to the ratification of that extension. +The definitions of the RLB, MMWP, and MML fields are furnished by the +PMP-enhancement extension, Smepmp. -The definition of the PMM field will be furnished by the forthcoming -Smmpm extension. Its allocation within `mseccfg` may change prior to the -ratification of that extension. +The definition of the PMM field is furnished by the Smmpm extension. The Zicfilp extension adds the `MLPE` field in `mseccfg`. When `MLPE` field is 1, Zicfilp extension is enabled in M-mode. When the `MLPE` field is 0, the @@ -3134,10 +3139,10 @@ writing `mtimecmp`). The interrupt will only be taken if interrupts are enabled and the MTIE bit is set in the `mie` register. .Machine time register (memory-mapped control register). -include::images/bytefield/mtime.adoc[] +include::images/bytefield/mtime.edn[] .Machine time compare register (memory-mapped control register). -include::images/bytefield/mtimecmp.adoc[] +include::images/bytefield/mtimecmp.edn[] ifeval::[{note} == true] [NOTE] @@ -3207,13 +3212,21 @@ ifdef::archi-default,XLEN-32[] endif::[] +ifdef::archi-default,RVU-true[] +The `time` CSR is a read-only shadow of the memory-mapped `mtime` register. +When XLEN=32, the `timeh` CSR is a read-only shadow of the upper 32 bits of the +memory-mapped `mtime` register, while `time` shadows only the lower 32 bits of +`mtime`. 
+When `mtime` changes, it is guaranteed to be reflected in `time` and `timeh` +eventually, but not necessarily immediately. +endif::[] === Machine-Mode Privileged Instructions ==== Environment Call and Breakpoint -include::images/wavedrom/mm-env-call.adoc[] +include::images/wavedrom/mm-env-call.edn[] ifdef::archi-default,RVU-true[] The ECALL instruction is used to make a request to the supporting @@ -3266,7 +3279,7 @@ not increment the `minstret` CSR. Instructions to return from trap are encoded under the PRIV minor opcode. -include::images/wavedrom/trap-return.adoc[] +include::images/wavedrom/trap-return.edn[] ifdef::archi-default,RVU-true[] To return after handling a trap, there are separate trap return @@ -3334,7 +3347,7 @@ cannot raise an illegal-instruction exception because TW=0 in `mstatus`, as described in <>. endif::[] -include::images/wavedrom/wfi.adoc[] +include::images/wavedrom/wfi.edn[] If an enabled interrupt is present or later becomes present while the hart is stalled, the interrupt trap will be taken on the following @@ -3429,7 +3442,7 @@ minimum required privilege mode, as do other SYSTEM instructions. [[customsys]] .SYSTEM instruction encodings designated for custom use. -include::images/bytefield/cust-sys-instr.adoc[] +include::images/bytefield/cust-sys-instr.edn[] [[reset]] === Reset @@ -3448,7 +3461,9 @@ the platform mandates a different reset value for some PMP registers’ A and L fields. If the hypervisor extension is implemented, the `hgatp`.MODE and `vsatp`.MODE fields are reset to 0. If the Smrnmi extension is implemented, the `mnstatus`.NMIE field is reset to 0. No - *WARL* field contains an illegal value. All other hart state is UNSPECIFIED. + *WARL* field contains an illegal value. If the Zicfilp extension is +implemented, the `mseccfg`.MLPE field is reset to 0. All other hart +state is UNSPECIFIED. 
The `mcause` values after reset have implementation-specific interpretation, but the value 0 should be returned on implementations @@ -3621,31 +3636,13 @@ endif::[] ifdef::archi-CVA6[] -[{ohg-config}] PMAs are inherent properties of the underlying hardware. The PMAs of -some memory regions are fixed at chip design time. - -[{ohg-config}] Some PMAs are dynamically -checked in hardware later in the execution pipeline after the physical -address is known, as some operations will not be supported at all -physical memory addresses, and some operations require knowing the -setting of a PMA attribute. - -[{ohg-config}] For RISC-V, we separate out specification and checking of PMAs into a -separate hardware structure, the _PMA checker_. In {ohg-config}, the -attributes are known at system design time for each physical address -region, and are hardwired into the PMA checker. -PMAs are checked for any access to physical memory, including accesses -that have undergone virtual to physical memory translation. To aid in -system debugging, we strongly recommend that, where possible, RISC-V -processors precisely trap physical memory accesses that fail PMA checks. -Precisely trapped PMA violations manifest as instruction, load, or store -access-fault exceptions, distinct from virtual-memory page-fault -exceptions. Precise PMA traps might not always be possible, for example, -when probing a legacy bus architecture that uses access failures as part -of the discovery mechanism. In this case, error responses from -peripheral devices will be reported as imprecise bus-error interrupts. - -[{ohg-config}] PMAs are not readable by software. +[{ohg-config}] PMA is not implemented by {ohg-config} but information +is sent outside the processor to be able to check PMA outside processor. +These checkers are based on the following information contained in the +memory accesses requested by the processor. 
+- The information which indicates whether memory access is read, write or +execution, +- The access length information to check the subword and subblock access rights. endif::[] ==== Main Memory versus I/O Regions @@ -3820,7 +3817,7 @@ Specific supported values for this PMA are represented by MAG__NN__, e.g., MAG16 indicates the misaligned atomicity granule is at least 16 bytes. The misaligned atomicity granule PMA applies only to AMOs, loads and stores -defined in the base ISAs, and loads and stores of no more than MXLEN bits +defined in the base ISAs, and loads and stores of no more than XLEN bits defined in the F, D, and Q extensions. For an instruction in that set, if all accessed bytes lie within the same misaligned atomicity granule, the instruction will not raise an exception for @@ -4003,15 +4000,13 @@ modes’ uncacheable accesses. endif::[] ifndef::archi-default,DCacheEn-true[] -[{ohg-config}] Write accesses are not cached. No cache-coherence scheme +[{ohg-config}] Caches are not implemented. No cache-coherence scheme is implemented. - -If a PMA indicates non-cacheability, then accesses to that region must -be satisfied by the memory itself, not by any caches. endif::[] ==== Idempotency PMAs +ifdef::archi-default[] Idempotency PMAs describe whether reads and writes to an address region are idempotent. Main memory regions are assumed to be idempotent. For I/O regions, idempotency on reads and writes can be specified separately @@ -4022,6 +4017,7 @@ write access, then speculative or redundant accesses must be avoided. For the purposes of defining the idempotency PMAs, changes in observed memory ordering created by redundant accesses are not considered a side effect. +endif::[] ifeval::[{note} == true] [NOTE] @@ -4039,6 +4035,7 @@ could cause unexpected side effects. ==== endif::[] +ifdef::archi-default[] For non-idempotent regions, implicit reads and writes must not be performed early or speculatively, with the following exceptions. 
When a non-speculative implicit read is performed, an implementation is @@ -4053,6 +4050,11 @@ may be used to satisfy subsequent early or speculative implicit reads. The size of these naturally aligned power-of-2 regions is implementation-defined, but, for systems with page-based virtual memory, must not exceed the smallest supported page size. +endif::[] + +ifdef::archi-CVA6[] +[{ohg-config}] All memory accesses are idempotent. +endif::[] [[pmp]] === Physical Memory Protection @@ -4143,11 +4145,11 @@ endif::[] ifdef::archi-default[] [[pmpcfg-rv32]] .RV32 PMP configuration CSR layout. -include::images/bytefield/pmp-rv32.adoc[] +include::images/bytefield/pmp-rv32.edn[] [[pmpcfg-rv64]] .RV64 PMP configuration CSR layout. -include::images/bytefield/pmp-rv64.adoc[] +include::images/bytefield/pmp-rv64.edn[] The PMP address registers are CSRs named `pmpaddr0`-`pmpaddr63`. Each @@ -4175,11 +4177,11 @@ endif::[] ifdef::archi-default[] [[pmpaddr-rv32]] .PMP address register format, RV32. -include::images/bytefield/pmpaddr-rv32.adoc[] +include::images/bytefield/pmpaddr-rv32.edn[] [[pmpaddr-rv64]] .PMP address register format, RV64. -include::images/bytefield/pmpaddr-rv64.adoc[] +include::images/bytefield/pmpaddr-rv64.edn[] endif::[] ifdef::archi-CVA6[] @@ -4192,24 +4194,25 @@ implemented first. All PMP CSR fields are *WARL* and 56 upper entries are read-only zero. PMP CSRs are only accessible to M-mode. [{ohg-config}] The PMP configuration registers are densely packed into CSRs to minimize -context-switch time. For {ohg-config} with sixty four CSRs, `pmpcfg0`–`pmpcfg15`, hold -the configurations as shown +context-switch time. For {ohg-config}, sixteen CSRs, `pmpcfg0`–`pmpcfg15`, hold +the configurations `pmp0cfg`–`pmp63cfg` for the 64 PMP entries, as shown in <>. -The 14 upper entries are read-only zero. +The 14 upper PMP configuration CSRs, `pmpcfg2`-`pmpcfg15`, are read-only zero. [[pmpcfg-rv32]] .RV32 PMP configuration CSR layout. 
-include::images/bytefield/pmp-rv32.adoc[] +include::images/bytefield/pmp-rv32.edn[] [{ohg-config}] The PMP address registers are CSRs named `pmpaddr0`-`pmpaddr63`. Each PMP address register encodes bits 33-2 of a 34-bit physical address for RV32, as shown in <<pmpaddr-rv32>>. Not all physical address bits may be implemented, and so the `pmpaddr` registers are *WARL*. +The 56 upper PMP address CSRs, `pmpaddr8`-`pmpaddr63`, are read-only zero. [[pmpaddr-rv32]] .PMP address register format, RV32. -include::images/bytefield/pmpaddr-rv32.adoc[] +include::images/bytefield/pmpaddr-rv32.edn[] endif::[] <<pmpcfg>> shows the layout of a PMP configuration @@ -4220,7 +4223,7 @@ W, and X fields form a collective *WARL* field for which the combinations with R [[pmpcfg]] .PMP configuration register format. -include::images/bytefield/pmpcfg.adoc[] +include::images/bytefield/pmpcfg.edn[] Attempting to fetch an instruction from a PMP region that does not have @@ -4237,7 +4240,6 @@ access-fault exception. The A field in a PMP entry's configuration register encodes the address-matching mode of the associated PMP address register. The encoding of this field is shown in <>. - When A=0, this PMP entry is disabled and matches no addresses. Two other address-matching modes are supported: naturally aligned power-of-2 regions (NAPOT), including the special case of naturally aligned @@ -4341,13 +4343,11 @@ back to NAPOT. endif::[] ifdef::archi-CVA6[] -[{ohg-config}] Although the PMP mechanism supports regions as small as four bytes, -platforms may specify coarser PMP regions. In general, the PMP grain is -latexmath:[$2^{G+2}$] bytes and must be the same across all PMP regions. -When latexmath:[$G \geq 1$] and -latexmath:[${\tt pmpcfg}_i$].A[1] is clear, i.e. the mode is OFF or TOR, -then bits latexmath:[${\tt pmpaddr}_i$][G-1:0] read as all zeros.
Bits -latexmath:[${\tt pmpaddr}_i$][G-1:0] do not affect the TOR address-matching +[{ohg-config}] The PMP grain is 8 bytes (latexmath:[$2^{G+2}$] with G = 1) +and must be the same across all PMP regions. +As latexmath:[${\tt pmpcfg}_i$].A[1] is always clear, i.e. the mode is OFF or TOR, +bit latexmath:[${\tt pmpaddr}_i$][0] reads as zero. Bit +latexmath:[${\tt pmpaddr}_i$][0] does not affect the TOR address-matching logic. endif::[] diff --git a/docs/riscv-isa/src/priv-preface.adoc b/docs/riscv-isa/src/priv-preface.adoc index 3ea20a98fc..c971911fb3 100644 --- a/docs/riscv-isa/src/priv-preface.adoc +++ b/docs/riscv-isa/src/priv-preface.adoc @@ -6,24 +6,24 @@ This document describes the RISC-V privileged architecture tailored for OpenHW Group {ohg-config}. -[.big]*_Preface to Version 20240703_* +[.big]*_Preface to Version 20241017_* This document describes the RISC-V privileged architecture. This -release, version 20240703, contains the following versions of the RISC-V ISA +release, version 20241017, contains the following versions of the RISC-V ISA modules: [%autowidth,float="center",align="center",cols="^,<,^",options="header",] |=== |Module |Version |Status -|_Machine ISA_ + +|*Machine ISA* + *Smstateen Extension* + *Smcsrind/Sscsrind Extension* + *Smepmp* + *Smcntrpmf* + *Smrnmi Extension* + *Smcdeleg* + -_Smdbltrp_ + -_Supervisor ISA_ + +*Smdbltrp* + +*Supervisor ISA* + *Svade Extension* + *Svnapot Extension* + *Svpbmt Extension* + @@ -31,20 +31,22 @@ _Supervisor ISA_ + *Svadu Extension* + *Sstc* + *Sscofpmf* + -_Ssdbltrp_ + +*Ssdbltrp* + *Hypervisor ISA* + -_Shlcofideleg_ + +*Shlcofideleg* + *Svvptc* -|_1.13_ + +|*1.13* + +*1.0* + +*1.0* + +*1.0* + *1.0* + *1.0* + *1.0* + *1.0* + +*1.13* + *1.0* + *1.0* + -_1.0_ + -_1.13_ + *1.0* + *1.0* + *1.0* + @@ -52,20 +54,20 @@ _1.13_ + *1.0* + *1.0* + *1.0* + -_1.0_ + *1.0* + -_0.1_ + *1.0* -|_Draft_ + +|*Ratified* + +*Ratified* + +*Ratified* + +*Ratified* + +*Ratified* + *Ratified* + *Ratified* + *Ratified* +
*Ratified* + *Ratified* + *Ratified* + -_Draft_ + -_Draft_ + *Ratified* + *Ratified* + *Ratified* + @@ -73,9 +75,7 @@ _Draft_ + *Ratified* + *Ratified* + *Ratified* + -_Draft_ + *Ratified* + -_Draft_ + *Ratified* |=== @@ -100,10 +100,10 @@ implemented. * Defined hardware error and software check exception codes. * Specified synchronization requirements when changing the PBMTE fields in `menvcfg` and `henvcfg`. -* Exposed count-overflow interrups to VS-mode via the Shlcofideleg extension. +* Exposed count-overflow interrupts to VS-mode via the Shlcofideleg extension. * Relaxed behavior of some HINTs when MXLEN > XLEN. -Finally, the following clarifications and document improvments have been made +Finally, the following clarifications and document improvements have been made since the last document release: * Transliterated the document from LaTeX into AsciiDoc. @@ -123,6 +123,7 @@ be set to a nonzero value but sometimes not. * Replaced the concept of vacant memory regions with inaccessible memory or I/O regions. * Clarified that timer and count-overflow interrupts' arrival in interrupt-pending registers is not immediate. +* Clarified that MXR affects only explicit memory accesses. [.big]*_Preface to Version 20211203_* diff --git a/docs/riscv-isa/src/riscv-privileged.adoc b/docs/riscv-isa/src/riscv-privileged.adoc index 6c11635f8d..bc8d53217e 100644 --- a/docs/riscv-isa/src/riscv-privileged.adoc +++ b/docs/riscv-isa/src/riscv-privileged.adoc @@ -4,8 +4,8 @@ include::config.adoc[] = The RISC-V Instruction Set Manual for {ohg-config}: Volume II: Privileged Architecture include::../docs-resources/global-config.adoc[] :description: Volume II - Privileged Architecture -:revnumber: 20240703 -//:revremark: Pre-release version +:revnumber: 20241017 +:revremark: This document is in Ratified state. 
//development: assume everything can change //stable: assume everything could change //frozen: of you implement this version you assume the risk that something might change because of the public review cycle but we expect little to no change. @@ -15,7 +15,7 @@ include::../docs-resources/global-config.adoc[] :appendix-caption: Appendix :imagesdir: ../docs-resources/images :title-logo-image: image:risc-v_logo.png["RISC-V International Logo",pdfwidth=3.25in,align=center] -:page-background-image: image:draft.png[opacity=20%] +//:page-background-image: image:draft.png[opacity=20%] :title-page-background-image: image:ohg_logo.png[fit=none,pdfwidth=3.25in,position=top] //:title-page-background-image: none //:back-cover-image: image:backpage.png[opacity=25%] @@ -70,11 +70,11 @@ Avižienis, Jacob Bachmeyer, Allen J. Baum, Jonathan Behrens, Paolo Bonzini, Rus Christopher Celio, Chuanhua Chang, David Chisnall, Anthony Coulter, Palmer Dabbelt, Monte Dalrymple, Paul Donahue, Greg Favor, Dennis Ferguson, Marc Gauthier, Andy Glew, Gary Guo, Mike Frysinger, John Hauser, David Horner, Olof -Johansson, David Kruckemyer, Yunsup Lee, Daniel Lustig, Andrew Lutomirski, Prashanth Mundkur, +Johansson, David Kruckemyer, Yunsup Lee, Daniel Lustig, Andrew Lutomirski, Martin Maas, Prashanth Mundkur, Jonathan Neuschäfer, Rishiyur Nikhil, Stefan O'Rear, Albert Ou, John Ousterhout, David Patterson, Dmitri Pavlov, Kade Phillips, Josh Scheid, Colin Schmidt, Michael Taylor, Wesley Terpstra, Matt Thomas, Tommy Thorn, Ray -VanDeWalker, Megan Wachs, Steve Wallach, Andrew Waterman, Claire Wolf, +VanDeWalker, Megan Wachs, Steve Wallach, Andrew Waterman, Claire Wolf, Adam Zabrocki, and Reinoud Zandijk.._ _This document is released under a Creative Commons Attribution 4.0 International License._ @@ -106,6 +106,7 @@ include::sscofpmf.adoc[] include::hypervisor.adoc[] include::priv-cfi.adoc[] include::ssdbltrp.adoc[] +include::zpm.adoc[] include::priv-insns.adoc[] include::priv-history.adoc[] 
include::bibliography.adoc[] diff --git a/docs/riscv-isa/src/riscv-unprivileged.adoc b/docs/riscv-isa/src/riscv-unprivileged.adoc index aa7c082c2b..c9c5bb8861 100644 --- a/docs/riscv-isa/src/riscv-unprivileged.adoc +++ b/docs/riscv-isa/src/riscv-unprivileged.adoc @@ -4,7 +4,7 @@ include::config.adoc[] = The RISC-V Instruction Set Manual for {ohg-config}: Volume I - Unprivileged Architecture include::../docs-resources/global-config.adoc[] :description: Unprivileged Architecture -:revnumber: 20240703 +:revnumber: 20241017 //:revremark: Pre-release version :colophon: :preface-title: Preamble @@ -30,6 +30,7 @@ include::../docs-resources/global-config.adoc[] :example-caption: Example :listing-caption: Listing :sectnums: +:sectnumlevels: 5 :toc: left :toclevels: 5 :source-highlighter: pygments @@ -197,7 +198,6 @@ include::zfinx.adoc[] include::c-st-ext.adoc[] include::zc.adoc[] include::b-st-ext.adoc[] -include::j-st-ext.adoc[] include::p-st-ext.adoc[] include::v-st-ext.adoc[] include::scalar-crypto.adoc[] diff --git a/docs/riscv-isa/src/rnmi.adoc b/docs/riscv-isa/src/rnmi.adoc index 7bfa48beb4..5d9db31dad 100644 --- a/docs/riscv-isa/src/rnmi.adoc +++ b/docs/riscv-isa/src/rnmi.adoc @@ -1,5 +1,5 @@ [[rnmi]] -== "Smrnmi" Extension for Resumable Non-Maskable Interrupts, Version 0.5 +== "Smrnmi" Extension for Resumable Non-Maskable Interrupts, Version 1.0 ifeval::[{RVZsmrnmi} == false] {ohg-config}: This extension is not supported. 
diff --git a/docs/riscv-isa/src/rv64.adoc b/docs/riscv-isa/src/rv64.adoc index 05a4e3bf90..38c52e66c7 100644 --- a/docs/riscv-isa/src/rv64.adoc +++ b/docs/riscv-isa/src/rv64.adoc @@ -46,7 +46,7 @@ endif::[] ==== Integer Register-Immediate Instructions -include::images/wavedrom/rv64i-base-int.adoc[] +include::images/wavedrom/rv64i-base-int.edn[] [[rv64i-base-int]] //.RV64I register-immediate instructions @@ -57,7 +57,7 @@ immediate to register _rs1_ and produces the proper sign extension of a writes the sign extension of the lower 32 bits of register _rs1_ into register _rd_ (assembler pseudoinstruction SEXT.W). -include::images/wavedrom/rv64i-slli.adoc[] +include::images/wavedrom/rv64i-slli.edn[] [[rv64i-slli]] //.RV64I register-immediate (descr ADDIW) instructions @@ -74,7 +74,7 @@ copied into the vacated upper bits). (((RV64I, SRLIW))) (((RV64I, RV64I-only))) -include::images/wavedrom/rv64i-slliw.adoc[] +include::images/wavedrom/rv64i-slliw.edn[] [[rv64i-slliw]] SLLIW, SRLIW, and SRAIW are RV64I-only instructions that are analogously @@ -91,7 +91,7 @@ are marked as reserved. This is a backwards-compatible change. ==== endif::[] -include::images/wavedrom/rv64_lui-auipc.adoc[] +include::images/wavedrom/rv64-lui-auipc.edn[] [[rv64_lui-auipc]] //.RV64I register-immediate (descr) instructions @@ -119,7 +119,7 @@ endif::[] ==== Integer Register-Register Operations //this diagramdoesn't match the tex specification -include::images/wavedrom/rv64i_int-reg-reg.adoc[] +include::images/wavedrom/rv64i-int-reg-reg.edn[] [[int_reg-reg]] //.RV64I integer register-register instructions @@ -147,7 +147,7 @@ results to 64 bits. The shift amount is given by _rs2[4:0]_. RV64I extends the address space to 64 bits. The execution environment will define what portions of the address space are legal to access. 
-include::images/wavedrom/load_store.adoc[] +include::images/wavedrom/load-store.edn[] [[load_store]] //.Load and store instructions diff --git a/docs/riscv-isa/src/supervisor.adoc b/docs/riscv-isa/src/supervisor.adoc index e72fce43dd..697cbf541f 100644 --- a/docs/riscv-isa/src/supervisor.adoc +++ b/docs/riscv-isa/src/supervisor.adoc @@ -794,7 +794,7 @@ ifndef::archi-default,MTvalEn-true[] [{ohg-config}] The `stval` register is an MXLEN-bit read-only 0 register. endif::[] - +[[sec:senvcfg]] ==== Supervisor Environment Configuration (`senvcfg`) Register The `senvcfg` CSR is an SXLEN-bit read/write register, formatted as @@ -908,17 +908,12 @@ enabled. endif::[] ifdef::archi-default[] -The definition of the CBZE field will be furnished by the forthcoming -Zicboz extension. Its allocation within `senvcfg` may change prior to -the ratification of that extension. +The definition of the CBZE field is furnished by the Zicboz extension. -The definitions of the CBCFE and CBIE fields will be furnished by the -forthcoming Zicbom extension. Their allocations within `senvcfg` may -change prior to the ratification of that extension. +The definitions of the CBCFE and CBIE fields are furnished by the Zicbom +extension. -The definition of the PMM field will be furnished by the forthcoming -Ssnpm extension. Its allocation within `senvcfg` may change prior to the -ratification of that extension. +The definition of the PMM field is furnished by the Ssnpm extension. The Zicfilp extension adds the `LPE` field in `senvcfg`. When the `LPE` field is set to 1, the Zicfilp extension is enabled in VU/U-mode. When the `LPE` field is @@ -1273,13 +1268,6 @@ integer register _rs2_, except for entries containing global mappings. If the value held in _rs1_ is not a valid virtual address, then the SFENCE.VMA instruction has no effect. No exception is raised in this case. - -When __rs2__≠``x0``, bits SXLEN-1:ASIDMAX of the value held -in _rs2_ are reserved for future standard use. 
Until their use is -defined by a standard extension, they should be zeroed by software and -ignored by current implementations. Furthermore, if -ASIDLEN RuleGroup=Blackbox Resolution ############### +++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++ -Severity Rule Name Count Short Help +Severity Rule Name Count Short Help =============================================================================== -WARNING WarnAnalyzeBBox 1 Reports black boxes in the design with - Warn severity. +WARNING WarnAnalyzeBBox 1 Reports black boxes in the design with + Warn severity. +++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++ ############### BuiltIn -> RuleGroup=Command-line read ############### +++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++ -Severity Rule Name Count Short Help +Severity Rule Name Count Short Help =============================================================================== -INFO HdlLibDuCheck_03 1 Reports that 'hdllibdu' is not required - if no precompiled design unit is used - in current run. +INFO HdlLibDuCheck_03 1 Reports that 'hdllibdu' is not required + if no precompiled design unit is used + in current run. +++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++ ############### BuiltIn -> RuleGroup=Design Read ############### +++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++ -Severity Rule Name Count Short Help +Severity Rule Name Count Short Help =============================================================================== -WARNING SYNTH_12605 4 Used Priority/Unique Type case/if - statement but all the conditions are - not covered -WARNING SYNTH_12608 1 The logic of the always block - mismatches with the type of Always - Block -WARNING SYNTH_12611 2 Property blocks will be ignored for - synthesis -WARNING SYNTH_5064 37 Non-synthesizable statements are - ignored for synthesis. 
-WARNING SYNTH_5143 11 Initial block is ignored for synthesis -WARNING SYNTH_89 4 Initial Assignment at Declaration is - ignored by synthesis. -WARNING WRN_1024 3 Signed argument is passed to $signed - system function call, or unsigned - argument passed to $unsigned system - function call. -INFO DetectTopDesignUnits 1 Identify the top-level design units in - user design. -INFO ElabSummary 1 Generates Elaborated design units - Summary data +WARNING SYNTH_12605 5 Used Priority/Unique Type case/if + statement but all the conditions are + not covered +WARNING SYNTH_12608 1 The logic of the always block + mismatches with the type of Always + Block +WARNING SYNTH_12611 2 Property blocks will be ignored for + synthesis +WARNING SYNTH_5064 38 Non-synthesizable statements are + ignored for synthesis. +WARNING SYNTH_5143 11 Initial block is ignored for synthesis +WARNING SYNTH_89 4 Initial Assignment at Declaration is + ignored by synthesis. +WARNING WRN_27 1 Bit-select should not be out-of-range. +INFO DetectTopDesignUnits 1 Identify the top-level design units in + user design. 
+INFO ElabSummary 1 Generates Elaborated design units + Summary data +++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++ ############### Non-BuiltIn -> Goal=lint/lint_rtl ############### +++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++ -Severity Rule Name Count Short Help +Severity Rule Name Count Short Help =============================================================================== -ERROR InferLatch 2 Latch inferred -ERROR UndrivenInTerm-ML 4 Undriven but loaded input terminal of - an instance detected -ERROR W123 18 A signal or variable has been read but - is not set -ERROR W416 1 Width of return type and return value - of a function should be same (Verilog) - Range of return type and return value - of a function should be same (VHDL) -WARNING FlopEConst 19 Flip-flop enable pin is permanently - disabled or enabled -WARNING ParamWidthMismatch-ML 1 Parameter width does not match with the - value assigned -WARNING STARC05-1.3.1.3 1 Asynchronous reset/preset signals must - not be used as non-reset/preset or - synchronous reset/preset signals -WARNING STARC05-2.1.3.1 2 Bit-width of function arguments must - match bit-width of the corresponding - function inputs. -WARNING STARC05-2.1.4.5 1 Bit-wise operators must be used instead - of logic operators in multi-bit - operations. -WARNING STARC05-2.1.5.3 1 Conditional expressions should evaluate - to a scalar. -WARNING STARC05-2.2.3.3 14 Do not assign over the same signal in - an always construct for sequential - circuits -WARNING W224 1 Multi-bit expression found when one-bit - expression expected -WARNING W240 297 An input has been declared but is not - read -WARNING W263 4 A case expression width does not match - case select expression width -WARNING W287b 32 Output port of an instance is not - connected -WARNING W415a 536 Signal may be multiply assigned (beside - initialization) in the same scope. 
-WARNING W480 3 Loop index is not of type integer -WARNING W486 2 Shift overflow - some bits may be lost -WARNING W528 481 A signal or variable is set but never - read -INFO W240 1 An input has been declared but is not - read -INFO W528 1 A signal or variable is set but never - read +ERROR InferLatch 2 Latch inferred +ERROR UndrivenInTerm-ML 1 Undriven but loaded input terminal of + an instance detected +ERROR W123 11 A signal or variable has been read but + is not set +ERROR W416 1 Width of return type and return value + of a function should be same (Verilog) + Range of return type and return value + of a function should be same (VHDL) +WARNING FlopEConst 19 Flip-flop enable pin is permanently + disabled or enabled +WARNING ParamWidthMismatch-ML 1 Parameter width does not match with the + value assigned +WARNING STARC05-1.3.1.3 1 Asynchronous reset/preset signals must + not be used as non-reset/preset or + synchronous reset/preset signals +WARNING STARC05-2.1.3.1 2 Bit-width of function arguments must + match bit-width of the corresponding + function inputs. +WARNING STARC05-2.1.4.5 1 Bit-wise operators must be used instead + of logic operators in multi-bit + operations. +WARNING STARC05-2.1.5.3 2 Conditional expressions should evaluate + to a scalar. +WARNING STARC05-2.2.3.3 14 Do not assign over the same signal in + an always construct for sequential + circuits +WARNING W224 2 Multi-bit expression found when one-bit + expression expected +WARNING W263 4 A case expression width does not match + case select expression width +WARNING W287b 36 Output port of an instance is not + connected +WARNING W415a 45 Signal may be multiply assigned (beside + initialization) in the same scope. 
+WARNING W480 3 Loop index is not of type integer +WARNING W486 2 Shift overflow - some bits may be lost +WARNING W528 482 A signal or variable is set but never + read +INFO W528 1 A signal or variable is set but never + read +++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++ diff --git a/spyglass/sg_setup/cva6/cva6_goals_setup.tcl b/spyglass/sg_setup/cva6/cva6_goals_setup.tcl index e3231d6e76..331167745a 100755 --- a/spyglass/sg_setup/cva6/cva6_goals_setup.tcl +++ b/spyglass/sg_setup/cva6/cva6_goals_setup.tcl @@ -18,6 +18,11 @@ # read_file -type waiver ######################################################################################################## +# ignore multiple assignment bitwise or violations (W415a) in for loop +set_parameter ignore_bitwiseor_assignment yes +# ignore multiple assignment violations (W415a) in if/else or case +set_parameter ignore_if_case_statement yes + ## Goal:cdc/cdc_verify_struct ################## current_goal cdc/cdc_verify_struct set_goal_option report {count moresimple moresimple_sevclass sign_off summary waiver CKSGDCInfo Clock-Reset-Summary CDC-report Ac_sync_group_detail Glitch_detailed CrossingInfo SynchInfo Clock-Reset-Detail} diff --git a/spyglass/sg_setup/cva6/cva6_waiver.awl b/spyglass/sg_setup/cva6/cva6_waiver.awl index 2445e09b99..44ca186667 100644 --- a/spyglass/sg_setup/cva6/cva6_waiver.awl +++ b/spyglass/sg_setup/cva6/cva6_waiver.awl @@ -11,3 +11,4 @@ waive -file_line {$CVA6_REPO_DIR/common/local/util/sram_cache.sv} {85} -severi waive -file { {$CVA6_REPO_DIR/vendor/pulp-platform/tech_cells_generic/src/rtl/tc_sram.sv} } -severity { {ERROR} } -rule { {ErrorAnalyzeBBox} } waive -file { {$CVA6_REPO_DIR/vendor/pulp-platform/tech_cells_generic/src/rtl/tc_sram.sv} } -severity { {ERROR} } -rule { {SYNTH_5251} } waive -file { {$CVA6_REPO_DIR/common/local/util/tc_sram_wrapper_cache_techno.sv} } -du { {tc_sram_wrapper_cache_techno} } -severity { {ERROR} } -rule { {ErrorAnalyzeBBox} } +waive -rule 
{ {W240} } -comment {Created by akassimi on 26-Jul-2024 18:36:59} diff --git a/util/toolchain-builder/README.md b/util/toolchain-builder/README.md index 2a1bf35d00..f6b56e5dcb 100644 --- a/util/toolchain-builder/README.md +++ b/util/toolchain-builder/README.md @@ -48,10 +48,10 @@ upstream toolchain (default: GCC 13.1.0) for bare-metal 32-bit and 64-bit applic INSTALL_DIR=$RISCV # 2. Fetch the source code of the toolchain (assumes Internet access.) - sh get-toolchain.sh + bash get-toolchain.sh # 3. Build and install the toolchain (requires write+create permissions for $INSTALL_DIR.) - sh build-toolchain.sh $INSTALL_DIR + bash build-toolchain.sh $INSTALL_DIR ## File and directory structure @@ -93,9 +93,9 @@ missing directories of the installation location._ Once a configuration name `CONFIG_NAME` and an installation location `INSTALL_DIR` are chosen, use - sh get-toolchain.sh CONFIG_NAME + bash get-toolchain.sh CONFIG_NAME # E.g., - # sh get-toolchain.sh gcc-13.1.0-baremetal + # bash get-toolchain.sh gcc-13.1.0-baremetal to fetch/update the source code and to check out the matching baseline of code. @@ -109,9 +109,9 @@ will be selected implicitly. _The default configuration is currently named To build the toolchain from the retrieved source baseline, use - sh build-toolchain.sh CONFIG_NAME INSTALL_DIR + bash build-toolchain.sh CONFIG_NAME INSTALL_DIR # E.g., - # sh build-toolchain.sh gcc-13.1.0-baremetal $RISCV + # bash build-toolchain.sh gcc-13.1.0-baremetal $RISCV To speedup the building it is recommended to set the number of threads to use @@ -126,9 +126,9 @@ code such as a change of baseline configuration. 
_Whenever the source configuration is changed, please use the `-f` (or `--force`) option to forcibly rebuild the entire toolchain_: - sh build-toolchain.sh -f CONFIG_NAME INSTALL_DIR + bash build-toolchain.sh -f CONFIG_NAME INSTALL_DIR # E.g., - # sh build-toolchain.sh -f gcc-13.1.0-baremetal $RISCV + # bash build-toolchain.sh -f gcc-13.1.0-baremetal $RISCV ## Defining new configurations diff --git a/vendor/pulp-platform/fpga-support/rtl/SyncDpRam_ind_r_w.sv b/vendor/pulp-platform/fpga-support/rtl/SyncDpRam_ind_r_w.sv new file mode 100644 index 0000000000..afddbcc7e4 --- /dev/null +++ b/vendor/pulp-platform/fpga-support/rtl/SyncDpRam_ind_r_w.sv @@ -0,0 +1,59 @@ +// Copyright 2024 PlanV Technologies +// +// Licensed under the Solderpad Hardware Licence, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// SPDX-License-Identifier: Apache-2.0 WITH SHL-2.0 +// You may obtain a copy of the License at https://solderpad.org/licenses +// +// Inferable, Synchronous Dual-Port RAM, there are a write port and a read port fully independent +// +// +// This module is designed to work with both Xilinx, Microchip and Altera FPGA tools by following the respective +// guidelines: +// - Xilinx UG901 Vivado Design Suite User Guide: Synthesis +// - Inferring Microchip PolarFire RAM Blocks +// - Altera Quartus II Handbook Volume 1: Design and Synthesis (p. 
768) +// +// Current Maintainers:: Angela Gonzalez - PlanV Technologies + +module SyncDpRam_ind_r_w +#( + parameter ADDR_WIDTH = 10, + parameter DATA_DEPTH = 1024, // usually 2**ADDR_WIDTH, but can be lower + parameter DATA_WIDTH = 32 +)( + input logic Clk_CI, + + // Write port + input logic WrEn_SI, + input logic [ADDR_WIDTH-1:0] WrAddr_DI, + input logic [DATA_WIDTH-1:0] WrData_DI, + + // Read port + input logic [ADDR_WIDTH-1:0] RdAddr_DI, + output logic [DATA_WIDTH-1:0] RdData_DO +); + +// logic [DATA_WIDTH-1:0] mem [DATA_DEPTH-1:0]= '{default:0}; +(* ramstyle = "mlab" *) logic [DATA_WIDTH-1:0] mem [DATA_DEPTH-1:0]= '{default:0}; + + // WRITE + always_ff @(posedge Clk_CI) + begin + if (WrEn_SI) begin + mem[WrAddr_DI] <= WrData_DI; + end + RdData_DO = mem[RdAddr_DI]; + end + + //////////////////////////// + // assertions + //////////////////////////// + + // pragma translate_off + assert property + (@(posedge Clk_CI) (longint'(2)**longint'(ADDR_WIDTH) >= longint'(DATA_DEPTH))) + else $error("depth out of bounds"); + // pragma translate_on + +endmodule \ No newline at end of file diff --git a/vendor/pulp-platform/fpga-support/rtl/SyncThreePortRam.sv b/vendor/pulp-platform/fpga-support/rtl/SyncThreePortRam.sv new file mode 100644 index 0000000000..2b65c9ec80 --- /dev/null +++ b/vendor/pulp-platform/fpga-support/rtl/SyncThreePortRam.sv @@ -0,0 +1,65 @@ +// Copyright 2024 PlanV Technologies +// +// Licensed under the Solderpad Hardware Licence, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// SPDX-License-Identifier: Apache-2.0 WITH SHL-2.0 +// You may obtain a copy of the License at https://solderpad.org/licenses +// +// Inferable, Asynchronous Three-Ports RAM, there are a write port and two read ports +// +// +// This module is designed to work with both Xilinx, Microchip and Altera FPGA tools by following the respective +// guidelines: +// - Xilinx UG901 Vivado Design Suite User Guide: Synthesis +// - Inferring Microchip PolarFire RAM Blocks +// - Altera Quartus II Handbook Volume 1: Design and Synthesis (p. 768) +// +// Current Maintainers:: Angela Gonzalez - PlanV Technologies + + +module SyncThreePortRam +#( + parameter ADDR_WIDTH = 10, + parameter DATA_DEPTH = 1024, // usually 2**ADDR_WIDTH, but can be lower + parameter DATA_WIDTH = 32 +)( + input logic Clk_CI, + + // Write port + input logic WrEn_SI, + input logic [ADDR_WIDTH-1:0] WrAddr_DI, + input logic [DATA_WIDTH-1:0] WrData_DI, + + // Read ports + input logic [ADDR_WIDTH-1:0] RdAddr_DI_0, + input logic [ADDR_WIDTH-1:0] RdAddr_DI_1, + + output logic [DATA_WIDTH-1:0] RdData_DO_0, + output logic [DATA_WIDTH-1:0] RdData_DO_1 +); + +logic [DATA_WIDTH-1:0] mem [DATA_DEPTH-1:0]= '{default:0}; + + // WRITE + always_ff @(posedge Clk_CI) + begin + if (WrEn_SI) begin + mem[WrAddr_DI] <= WrData_DI; + end + + RdData_DO_0 = mem[RdAddr_DI_0]; + RdData_DO_1 = mem[RdAddr_DI_1]; + + end + + //////////////////////////// + // assertions + //////////////////////////// + + // pragma translate_off + assert property + (@(posedge Clk_CI) (longint'(2)**longint'(ADDR_WIDTH) >= longint'(DATA_DEPTH))) + else $error("depth out of bounds"); + // pragma translate_on + +endmodule diff --git a/vendor/riscv/riscv-config/riscv_config/schemas/schema_platform.yaml b/vendor/riscv/riscv-config/riscv_config/schemas/schema_platform.yaml index 3e7dddf623..c602e24d26 100644 --- a/vendor/riscv/riscv-config/riscv_config/schemas/schema_platform.yaml +++ b/vendor/riscv/riscv-config/riscv_config/schemas/schema_platform.yaml @@ -326,3 
+326,69 @@ zicbo_cache_block_sz: check_with: cache_block_size default: implemented: False + +### +#memory_map +#-------- +# +# - **Description**: Memory map - list of memory regions with their attributes +# - **Constraints**: +# +# - All addresses must be non-negative (0 or higher) +# - All sizes must be non-negative (0 or higher) +# +# - **Examples**: +# +# .. code-block:: yaml +# +# memory_map: +# - memory_region: +# name: bootrom +# base_addr: 0x10000 +# size: 0x1000 +# description: System boot ROM +# attributes: +# read_only: True +# - memory_region: +# name: dram +# base_addr: 0x80000000 +# size: 0x40000000 +# description: Main memory + +memory_map: + type: list + schema: + type: dict + schema: + memory_region: + type: dict + schema: + description: + type: string + default: A homogeneous memory region + name: + type: string + required: True + base_addr: + type: integer + min: 0 + required: True + size: + type: integer + min: 0 + required: True + attributes: + type: dict + schema: + executable: + type: boolean + cached: + type: boolean + non_idempotent: + type: boolean + read_only: + type: boolean + default: { executable: True, cached: True, non_idempotent: False, read_only: False } + required: True + required: True + required: True diff --git a/verif/core-v-verif b/verif/core-v-verif index 2d9f96e513..bfbbd19c13 160000 --- a/verif/core-v-verif +++ b/verif/core-v-verif @@ -1 +1 @@ -Subproject commit 2d9f96e513a4004b2536fe4062e1e2dd7665464d +Subproject commit bfbbd19c13aaf24b724eb32e5b5ac7b0cb951c54 diff --git a/verif/docs/Protocols/Makefile b/verif/docs/Protocols/Makefile new file mode 100644 index 0000000000..1cd221ea49 --- /dev/null +++ b/verif/docs/Protocols/Makefile @@ -0,0 +1,28 @@ +# Copyright 2024 Thales DIS France SAS +# +# Licensed under the Solderpad Hardware Licence, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# SPDX-License-Identifier: Apache-2.0 WITH SHL-2.0 +# You may obtain a copy of the License at https://solderpad.org/licenses/ +# +# Original Author: Zbigniew CHAMSKI - Thales + +FIGDIR=figures +WAVEDIR=wavedrom + +# Names of figure files, without directory prefix nor .svg suffix +FIGURES=interrupt-ack-uvm + +SVG_FILES=$(patsubst %,$(FIGDIR)/%.svg,$(FIGURES)) +WAVE_FILES=$(patsubst %,$(WAVEDIR)/%.wave,$(FIGURES)) + +all: $(SVG_FILES) + +# wavedrom-cli requires a local installation and needs nodejs version >= 14. +# See https://github.com/wavedrom/cli. +$(FIGDIR)/%.svg: $(WAVEDIR)/%.wave + wavedrom-cli -i $^ -s $@ + +clean: + $(RM) $(SVG_FILES) + diff --git a/verif/docs/Protocols/figures/interrupt-ack-uvm.svg b/verif/docs/Protocols/figures/interrupt-ack-uvm.svg new file mode 100644 index 0000000000..b081d5eee3 --- /dev/null +++ b/verif/docs/Protocols/figures/interrupt-ack-uvm.svg @@ -0,0 +1 @@ +clkint #0int #nbit[0]bit[n]execnormalhandlernormal@int_ack[0]@int_ack[n]>0>0ACKrcswzint activePlatformmiphartmem \ No newline at end of file diff --git a/verif/docs/Protocols/interrupt-verification.adoc b/verif/docs/Protocols/interrupt-verification.adoc new file mode 100644 index 0000000000..4a14ef477e --- /dev/null +++ b/verif/docs/Protocols/interrupt-verification.adoc @@ -0,0 +1,62 @@ +//// +Copyright 2024 Thales DIS France SAS + +Licensed under the Solderpad Hardware Licence, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +SPDX-License-Identifier: Apache-2.0 WITH SHL-2.0 +You may obtain a copy of the License at https://solderpad.org/licenses/ + +Original Author: Zbigniew CHAMSKI - Thales +//// + += Interrupt Verification protocol + +== Scope + +In the rest of this section it is assumed that the design-under-test is limited to a hart (or 'core') with its internal CSRs. The hart supports only Machine mode, explicitly excluding User and Supervisor modes as not implemented. 
+ +Arbitration between multiple sources of external interrupts is under the responsibility of the "platform" and thus performed outside of the hart. + +Therefore, the verification of interrupt handling at hart level covers two co-ordinated aspects: + +* the detection of "interrupt present" state for all implemented interrupt types among those listed in MIP/MIE; +* the generation of the response to the "interrupt present" state according to the fixed interrupt priority (see https://cva6.readthedocs.io/en/latest/04_cv32a65x/riscv/priv.html#_machine_interrupt_mip_and_mie_registers[Machine Interrupt Registers, CV32A65X Privileged ISA spec, section 3.1.9]). + +== Interfaces + +The interface for raising interrupts (from the platform to the hart) is a set of bits in the `MIP` (Machine Interrupt Present) CSR. These bits are controlled by the verification platform by driving hardware inputs of the hart. +The interface for clearing pending interrupts (from the hart to the platform) can either rely on writable "interrupt pending" bits in `MIP`, or "the implementation must provide some other mechanism for clearing the pending interrupt." + +On CV32A65X the "interrupt pending" bits in `MIP` are read-only and therefore, they cannot be used to acknowledge (clear) a pending interrupt. Hence, for the purpose of verification we introduce a 64-bit memory-mapped register `int_ack`. To acknowledge a pending interrupt represented by bit N in the `MIP` register, the hart performs a memory store at the address of `int_ack` with bit N set in the value being stored. All stores into `int_ack` shall be monitored by the platform and shall eventually result in clearing a pending interrupt if both the corresponding "interrupt pending" bit in MIP is set and the value written into `int_ack` has a bit set at the same bit position. The clearing of the corresponding pending interrupt by the platform may be immediate or delayed. + +The `int_ack` memory location has no fixed address. 
+ +== The protocol + +The basic operation of the interrupt verification protocol is shown in xref:fig-basic-raise-clear-protocol[xrefstyle=short]. + +.Basic interrupt raise-acknowledge protocol +[#fig-basic-raise-clear-protocol] +image::figures/interrupt-ack-uvm.svg[Basic interrupt verification protocol,800,opts=inline] + +The platform/testbench notifies the hart of raising an interrupt by setting the corresponding hardware input of the hart. This results in the hart setting a bit at position 'N' in the `mip` CSR according to the hardware input being raised (marker `r` for "raise"). +The hart detects the presence of a pending interrupt at position N in `mip`, and if that interrupt is enabled it starts executing the interrupt handler. + +While executing the handler, the hart performs a memory store to the symbolic location `int_ack` with bit N set to acknowledge the servicing of interrupt at position N in `mip` (markers `s` for "store" and `w` for "write"). + +The platform responds by clearing the interrupt (marker `c` for "clear"): it clears bit N in `mip` *and* performs a memory store that clears bit N in `int_ack` (marker `z` for "zero the ACK"). Both operations shall occur in finite time but quickly enough to clear the interrupt pending bit in `mip` before the hart enables interrupts at the end of the interrupt handler. + +== Software implementation + +Test programs used for interrupt verification shall reserve the necessary memory storage located at symbol `int_ack` and shall make the value of that symbol available to verification software. +To reserve the storage, assembly program shall use storage type `.dword`. C/C++ programs shall use scalar type `unsigned long int` with `volatile` qualifier to prevent over-optimization of assignment operations. + +The symbol shall be defined in section `.uvmif`. The linker scripts shall ensure that this section is aligned on a 64-byte multiple to reduce cache latency artefacts where applicable. + +== Rationale + +. 
The storage allocated for `int_ack` is 64 bits wide to support a single test program source for XLEN=32 and XLEN=64. +. The symbol `int_ack` has no fixed address to avoid hardcoded dependencies in the verification flow. +. In order to keep `int_ack` isolated from other symbols occurring in verification test programs, the symbol is placed in a dedicated section. +. The interrupt acknowledge mechanism abstracts from the Machine Timer Interrupt logic (cf. https://cva6.readthedocs.io/en/latest/04_cv32a65x/riscv/priv.html#_machine_timer_mtime_and_mtimecmp_registers[Machine Timer Registers, CV32A65X Privileged ISA spec, section 3.2.1]). + In particular, it relieves the verification platform from maintaining consistent `MTIME` and `MTIMECMP` values. diff --git a/verif/docs/Protocols/wavedrom/interrupt-ack-uvm.wave b/verif/docs/Protocols/wavedrom/interrupt-ack-uvm.wave new file mode 100644 index 0000000000..a842979eb4 --- /dev/null +++ b/verif/docs/Protocols/wavedrom/interrupt-ack-uvm.wave @@ -0,0 +1,39 @@ +// Copyright 2024 Thales DIS France SAS +// +// Licensed under the Solderpad Hardware Licence, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// SPDX-License-Identifier: Apache-2.0 WITH SHL-2.0 +// You may obtain a copy of the License at https://solderpad.org/licenses/ +// +// Original Author: Zbigniew CHAMSKI - Thales + +{signal: [ + { name: 'clk', wave: 'p..|..|...'}, + ['Platform', + ['int active', + {name: 'int #0', wave: '0..|..|...'}, + {name: 'int #n', wave: '01.|..|0..', + node: '.r.....c..' 
}, + ] + ], + {}, + ['hart', + ['mip', + {name: 'bit[0]', wave: 'x..|......'}, + {name: 'bit[n]', wave: '01.|..|0..', + node: '..........'}, + ], + {name: 'exec', wave: '7.4|..|.7.', data: 'normal handler normal', + node: '....s.....'}, + ], + { node: '.RE....ZX.'}, + {}, + ['mem', + {name: '@int_ack[0]', wave: '...|..|...'}, + {name: '@int_ack[n]', wave: '0..|.1|0..', + node: '.....w.z..'} + ] +], + edge: + ['R+E >0', 'Z+X >0', 's|->w ACK', 'r|R', 'c|z'], +} diff --git a/verif/docs/VerifPlans/csr_access/VP_IP000.yml b/verif/docs/VerifPlans/csr_access/VP_IP000.yml index 664f24fee5..a62965d9d2 100644 --- a/verif/docs/VerifPlans/csr_access/VP_IP000.yml +++ b/verif/docs/VerifPlans/csr_access/VP_IP000.yml @@ -17,7 +17,7 @@ subfeatures: !!omap description: Upon reset, RISC-V CVA6 Machine mode RW CSRs must initialize to their respective POR value. reqt_doc: - https://docs.openhwgroup.org/projects/cva6-user-manual/01_cva6_user/CV32A6_Control_Status_Registers.html + https://docs.openhwgroup.org/projects/cva6-user-manual/01_cva6_user/CSR_CV32A60X.html ref_mode: page ref_page: '' ref_section: '' @@ -68,7 +68,7 @@ subfeatures: !!omap random values like 0xa5a5a5a5, 0x5a5a5a5a, 0xffa1ae40.. and read using the CSR instructions defined in the instruction set architecture (ISA). reqt_doc: - https://docs.openhwgroup.org/projects/cva6-user-manual/01_cva6_user/CV32A6_Control_Status_Registers.html + https://docs.openhwgroup.org/projects/cva6-user-manual/01_cva6_user/CSR_CV32A60X.html ref_mode: page ref_page: '' ref_section: '' @@ -94,7 +94,7 @@ subfeatures: !!omap description: Accessing RISC-V CVA6 Machine Mode CSRs in different privilege modes (User, Supervisor and Machine modes). 
reqt_doc: - https://docs.openhwgroup.org/projects/cva6-user-manual/01_cva6_user/CV32A6_Control_Status_Registers.html + https://docs.openhwgroup.org/projects/cva6-user-manual/01_cva6_user/CSR_CV32A60X.html ref_mode: page ref_page: '' ref_section: '' diff --git a/verif/docs/VerifPlans/csr_access/VP_IP001.yml b/verif/docs/VerifPlans/csr_access/VP_IP001.yml index 4e72a8156c..8b16419697 100644 --- a/verif/docs/VerifPlans/csr_access/VP_IP001.yml +++ b/verif/docs/VerifPlans/csr_access/VP_IP001.yml @@ -16,7 +16,7 @@ subfeatures: !!omap description: Upon reset,RISC-V CVA6 Machine RO(read only) CSR must initialize to their respective POR value. reqt_doc: - https://docs.openhwgroup.org/projects/cva6-user-manual/01_cva6_user/CV32A6_Control_Status_Registers.html + https://docs.openhwgroup.org/projects/cva6-user-manual/01_cva6_user/CSR_CV32A60X.html ref_mode: page ref_page: '' ref_section: '' @@ -42,7 +42,7 @@ subfeatures: !!omap random values like 0xa5a5a5a5, 0x5a5a5a5a, 0xffa1ae40.. and confirm whether write into RO CSRs is possible or not. reqt_doc: - https://docs.openhwgroup.org/projects/cva6-user-manual/01_cva6_user/CV32A6_Control_Status_Registers.html + https://docs.openhwgroup.org/projects/cva6-user-manual/01_cva6_user/CSR_CV32A60X.html ref_mode: page ref_page: '' ref_section: '' @@ -68,7 +68,7 @@ subfeatures: !!omap description: Accessing RISC-V Machine read only CSRs in different privilege modes (User, Supervisor and Machine modes). 
reqt_doc: - https://docs.openhwgroup.org/projects/cva6-user-manual/01_cva6_user/CV32A6_Control_Status_Registers.html + https://docs.openhwgroup.org/projects/cva6-user-manual/01_cva6_user/CSR_CV32A60X.html ref_mode: page ref_page: '' ref_section: '' diff --git a/verif/docs/VerifPlans/csr_access/VP_IP002.yml b/verif/docs/VerifPlans/csr_access/VP_IP002.yml index c615ae16e4..51ed9fa8d0 100644 --- a/verif/docs/VerifPlans/csr_access/VP_IP002.yml +++ b/verif/docs/VerifPlans/csr_access/VP_IP002.yml @@ -17,7 +17,7 @@ subfeatures: !!omap description: Upon reset, RISC-V CVA6 Supervisor mode RW CSRs must initialize to their respective POR value. reqt_doc: - https://docs.openhwgroup.org/projects/cva6-user-manual/01_cva6_user/CV32A6_Control_Status_Registers.html + https://docs.openhwgroup.org/projects/cva6-user-manual/01_cva6_user/CSR_CV32A60X.html ref_mode: page ref_page: '' ref_section: '' @@ -67,7 +67,7 @@ subfeatures: !!omap writing random values like 0xa5a5a5a5, 0x5a5a5a5a, 0xffa1ae40.. and read using the CSR instructions defined in the instruction set architecture (ISA). reqt_doc: - https://docs.openhwgroup.org/projects/cva6-user-manual/01_cva6_user/CV32A6_Control_Status_Registers.html + https://docs.openhwgroup.org/projects/cva6-user-manual/01_cva6_user/CSR_CV32A60X.html ref_mode: page ref_page: '' ref_section: '' @@ -93,7 +93,7 @@ subfeatures: !!omap description: Accessing RISC-V CVA6 Supervisor Mode CSRs in different privilege modes (User,Supervisor and Machine modes). 
reqt_doc: - https://docs.openhwgroup.org/projects/cva6-user-manual/01_cva6_user/CV32A6_Control_Status_Registers.html + https://docs.openhwgroup.org/projects/cva6-user-manual/01_cva6_user/CSR_CV32A60X.html ref_mode: page ref_page: '' ref_section: '' diff --git a/verif/docs/VerifPlans/csr_access/VP_IP003.yml b/verif/docs/VerifPlans/csr_access/VP_IP003.yml index 3267c6030f..3099b330be 100644 --- a/verif/docs/VerifPlans/csr_access/VP_IP003.yml +++ b/verif/docs/VerifPlans/csr_access/VP_IP003.yml @@ -16,7 +16,7 @@ subfeatures: !!omap description: Upon reset, RISC-V CVA6 User mode counter CSRs must initialize to their respective POR value. reqt_doc: - https://docs.openhwgroup.org/projects/cva6-user-manual/01_cva6_user/CV32A6_Control_Status_Registers.html + https://docs.openhwgroup.org/projects/cva6-user-manual/01_cva6_user/CSR_CV32A60X.html ref_mode: page ref_page: '' ref_section: '' @@ -51,7 +51,7 @@ subfeatures: !!omap two continuous reads and checking whether the value in the second read is greater than the value in the first read." reqt_doc: - https://docs.openhwgroup.org/projects/cva6-user-manual/01_cva6_user/CV32A6_Control_Status_Registers.html + https://docs.openhwgroup.org/projects/cva6-user-manual/01_cva6_user/CSR_CV32A60X.html ref_mode: page ref_page: '' ref_section: '' @@ -80,7 +80,7 @@ subfeatures: !!omap description: Accessing RISC-V CVA6 user Mode counter CSR in different privilege modes (User, Supervisor and Machine modes). 
reqt_doc: - https://docs.openhwgroup.org/projects/cva6-user-manual/01_cva6_user/CV32A6_Control_Status_Registers.html + https://docs.openhwgroup.org/projects/cva6-user-manual/01_cva6_user/CSR_CV32A60X.html ref_mode: page ref_page: '' ref_section: '' diff --git a/verif/docs/VerifPlans/csr_access/VP_IP004.yml b/verif/docs/VerifPlans/csr_access/VP_IP004.yml index 8c7d5b8871..2250afd03e 100644 --- a/verif/docs/VerifPlans/csr_access/VP_IP004.yml +++ b/verif/docs/VerifPlans/csr_access/VP_IP004.yml @@ -16,7 +16,7 @@ subfeatures: !!omap description: Upon reset, RISC-V CVA6 Machine mode counter CSRs must initialize to their respective POR value. reqt_doc: - https://docs.openhwgroup.org/projects/cva6-user-manual/01_cva6_user/CV32A6_Control_Status_Registers.html + https://docs.openhwgroup.org/projects/cva6-user-manual/01_cva6_user/CSR_CV32A60X.html ref_mode: page ref_page: '' ref_section: '' @@ -52,7 +52,7 @@ subfeatures: !!omap tested by performing two continuous reads and checking whether the value in the second read is greater than the value in the first read." reqt_doc: - https://docs.openhwgroup.org/projects/cva6-user-manual/01_cva6_user/CV32A6_Control_Status_Registers.html + https://docs.openhwgroup.org/projects/cva6-user-manual/01_cva6_user/CSR_CV32A60X.html ref_mode: page ref_page: '' ref_section: '' @@ -81,7 +81,7 @@ subfeatures: !!omap description: Accessing RISC-V CVA6 user Machine mode counter CSRs in different privilege modes (User, Supervisor and Machine modes). 
reqt_doc: - https://docs.openhwgroup.org/projects/cva6-user-manual/01_cva6_user/CV32A6_Control_Status_Registers.html + https://docs.openhwgroup.org/projects/cva6-user-manual/01_cva6_user/CSR_CV32A60X.html ref_mode: page ref_page: '' ref_section: '' diff --git a/verif/docs/VerifPlans/csr_access/dvplan_csr-access.md b/verif/docs/VerifPlans/csr_access/dvplan_csr-access.md index 60e8d9e63c..861c79e614 100644 --- a/verif/docs/VerifPlans/csr_access/dvplan_csr-access.md +++ b/verif/docs/VerifPlans/csr_access/dvplan_csr-access.md @@ -89,7 +89,7 @@ Module: CSR ACCESS VERIFICATION[](#module-csr-access-verification "Permalink ##### Item: 000[](#item-000 "Permalink to this headline") -* **Requirement location:** https://docs.openhwgroup.org/projects/cva6-user-manual/01\_cva6\_user/CV32A6\_Control\_Status\_Registers.html +* **Requirement location:** https://docs.openhwgroup.org/projects/cva6-user-manual/01\_cva6\_user/CSR\_CV32A60X.html * **Feature Description** @@ -153,7 +153,7 @@ Module: CSR ACCESS VERIFICATION[](#module-csr-access-verification "Permalink ##### Item: 000[](#id2 "Permalink to this headline") -* **Requirement location:** https://docs.openhwgroup.org/projects/cva6-user-manual/01\_cva6\_user/CV32A6\_Control\_Status\_Registers.html +* **Requirement location:** https://docs.openhwgroup.org/projects/cva6-user-manual/01\_cva6\_user/CSR\_CV32A60X.html * **Feature Description** @@ -188,7 +188,7 @@ Module: CSR ACCESS VERIFICATION[](#module-csr-access-verification "Permalink ##### Item: 000[](#id3 "Permalink to this headline") -* **Requirement location:** https://docs.openhwgroup.org/projects/cva6-user-manual/01\_cva6\_user/CV32A6\_Control\_Status\_Registers.html +* **Requirement location:** https://docs.openhwgroup.org/projects/cva6-user-manual/01\_cva6\_user/CSR\_CV32A60X.html * **Feature Description** @@ -223,7 +223,7 @@ Module: CSR ACCESS VERIFICATION[](#module-csr-access-verification "Permalink ##### Item: 000[](#id5 "Permalink to this headline") -* 
**Requirement location:** https://docs.openhwgroup.org/projects/cva6-user-manual/01\_cva6\_user/CV32A6\_Control\_Status\_Registers.html +* **Requirement location:** https://docs.openhwgroup.org/projects/cva6-user-manual/01\_cva6\_user/CSR\_CV32A60X.html * **Feature Description** @@ -254,7 +254,7 @@ Module: CSR ACCESS VERIFICATION[](#module-csr-access-verification "Permalink ##### Item: 000[](#id6 "Permalink to this headline") -* **Requirement location:** https://docs.openhwgroup.org/projects/cva6-user-manual/01\_cva6\_user/CV32A6\_Control\_Status\_Registers.html +* **Requirement location:** https://docs.openhwgroup.org/projects/cva6-user-manual/01\_cva6\_user/CSR\_CV32A60X.html * **Feature Description** @@ -287,7 +287,7 @@ Module: CSR ACCESS VERIFICATION[](#module-csr-access-verification "Permalink ##### Item: 000[](#id7 "Permalink to this headline") -* **Requirement location:** https://docs.openhwgroup.org/projects/cva6-user-manual/01\_cva6\_user/CV32A6\_Control\_Status\_Registers.html +* **Requirement location:** https://docs.openhwgroup.org/projects/cva6-user-manual/01\_cva6\_user/CSR\_CV32A60X.html * **Feature Description** @@ -322,7 +322,7 @@ Module: CSR ACCESS VERIFICATION[](#module-csr-access-verification "Permalink ##### Item: 000[](#id9 "Permalink to this headline") -* **Requirement location:** https://docs.openhwgroup.org/projects/cva6-user-manual/01\_cva6\_user/CV32A6\_Control\_Status\_Registers.html +* **Requirement location:** https://docs.openhwgroup.org/projects/cva6-user-manual/01\_cva6\_user/CSR\_CV32A60X.html * **Feature Description** @@ -384,7 +384,7 @@ Module: CSR ACCESS VERIFICATION[](#module-csr-access-verification "Permalink ##### Item: 000[](#id13 "Permalink to this headline") -* **Requirement location:** https://docs.openhwgroup.org/projects/cva6-user-manual/01\_cva6\_user/CV32A6\_Control\_Status\_Registers.html +* **Requirement location:** https://docs.openhwgroup.org/projects/cva6-user-manual/01\_cva6\_user/CSR\_CV32A60X.html * 
**Feature Description** @@ -417,7 +417,7 @@ Module: CSR ACCESS VERIFICATION[](#module-csr-access-verification "Permalink ##### Item: 000[](#id15 "Permalink to this headline") -* **Requirement location:** https://docs.openhwgroup.org/projects/cva6-user-manual/01\_cva6\_user/CV32A6\_Control\_Status\_Registers.html +* **Requirement location:** https://docs.openhwgroup.org/projects/cva6-user-manual/01\_cva6\_user/CSR\_CV32A60X.html * **Feature Description** @@ -451,7 +451,7 @@ Module: CSR ACCESS VERIFICATION[](#module-csr-access-verification "Permalink ##### Item: 000[](#id17 "Permalink to this headline") -* **Requirement location:** https://docs.openhwgroup.org/projects/cva6-user-manual/01\_cva6\_user/CV32A6\_Control\_Status\_Registers.html +* **Requirement location:** https://docs.openhwgroup.org/projects/cva6-user-manual/01\_cva6\_user/CSR\_CV32A60X.html * **Feature Description** @@ -483,7 +483,7 @@ Module: CSR ACCESS VERIFICATION[](#module-csr-access-verification "Permalink ##### Item: 000[](#id18 "Permalink to this headline") -* **Requirement location:** https://docs.openhwgroup.org/projects/cva6-user-manual/01\_cva6\_user/CV32A6\_Control\_Status\_Registers.html +* **Requirement location:** https://docs.openhwgroup.org/projects/cva6-user-manual/01\_cva6\_user/CSR\_CV32A60X.html * **Feature Description** @@ -524,7 +524,7 @@ Module: CSR ACCESS VERIFICATION[](#module-csr-access-verification "Permalink ##### Item: 000[](#id20 "Permalink to this headline") -* **Requirement location:** https://docs.openhwgroup.org/projects/cva6-user-manual/01\_cva6\_user/CV32A6\_Control\_Status\_Registers.html +* **Requirement location:** https://docs.openhwgroup.org/projects/cva6-user-manual/01\_cva6\_user/CSR\_CV32A60X.html * **Feature Description** @@ -588,7 +588,7 @@ Module: CSR ACCESS VERIFICATION[](#module-csr-access-verification "Permalink ##### Item: 000[](#id23 "Permalink to this headline") -* **Requirement location:** 
https://docs.openhwgroup.org/projects/cva6-user-manual/01\_cva6\_user/CV32A6\_Control\_Status\_Registers.html +* **Requirement location:** https://docs.openhwgroup.org/projects/cva6-user-manual/01\_cva6\_user/CSR\_CV32A60X.html * **Feature Description** @@ -620,7 +620,7 @@ Module: CSR ACCESS VERIFICATION[](#module-csr-access-verification "Permalink ##### Item: 000[](#id25 "Permalink to this headline") -* **Requirement location:** https://docs.openhwgroup.org/projects/cva6-user-manual/01\_cva6\_user/CV32A6\_Control\_Status\_Registers.html +* **Requirement location:** https://docs.openhwgroup.org/projects/cva6-user-manual/01\_cva6\_user/CSR\_CV32A60X.html * **Feature Description** @@ -662,7 +662,7 @@ Module: CSR ACCESS VERIFICATION[](#module-csr-access-verification "Permalink ##### Item: 000[](#id27 "Permalink to this headline") -* **Requirement location:** https://docs.openhwgroup.org/projects/cva6-user-manual/01\_cva6\_user/CV32A6\_Control\_Status\_Registers.html +* **Requirement location:** https://docs.openhwgroup.org/projects/cva6-user-manual/01\_cva6\_user/CSR\_CV32A60X.html * **Feature Description** diff --git a/verif/env/corev-dv/cva6_asm_program_gen.sv b/verif/env/corev-dv/cva6_asm_program_gen.sv index 0fb37d34b9..d3a6f9a673 100644 --- a/verif/env/corev-dv/cva6_asm_program_gen.sv +++ b/verif/env/corev-dv/cva6_asm_program_gen.sv @@ -175,7 +175,7 @@ class cva6_asm_program_gen_c extends riscv_asm_program_gen; end else begin // Push user mode GPR to kernel stack before executing exception handling, this is to avoid // exception handling routine modify user program state unexpectedly - push_used_gpr_to_kernel_stack(status, scratch, 4, cfg_cva6.mstatus_mprv, cfg_cva6.sp, cfg_cva6.tp, instr); + push_used_gpr_to_kernel_stack(status, scratch, 3, cfg_cva6.mstatus_mprv, cfg_cva6.sp, cfg_cva6.tp, instr); // Checking xStatus can be optional if ISS (like spike) has different implementation of // certain fields compared with the RTL processor. 
if (cfg_cva6.check_xstatus) begin @@ -187,16 +187,19 @@ class cva6_asm_program_gen_c extends riscv_asm_program_gen; // Check if the exception is caused by an interrupt, if yes, jump to interrupt // handler Interrupt is indicated by xCause[XLEN-1] $sformatf("csrr x%0d, 0x%0x # %0s", cfg_cva6.gpr[0], cause, cause.name()), + $sformatf("srli x%0d, x%0d, %0d", cfg_cva6.gpr[0], cfg_cva6.gpr[0], XLEN-1), + $sformatf("bne x%0d, x0, %0s%0s_intr_handler", + cfg_cva6.gpr[0], hart_prefix(hart), mode), $sformatf("csrr x%0d, mepc", cfg_cva6.gpr[0]), - $sformatf("lbu x%0d, 0(x%0d)", cfg_cva6.gpr[3],cfg_cva6.gpr[0]), + $sformatf("lbu x%0d, 0(x%0d)", cfg_cva6.gpr[2],cfg_cva6.gpr[0]), $sformatf("li x%0d, 0x3", cfg_cva6.gpr[1]), - $sformatf("and x%0d, x%0d, x%0d", cfg_cva6.gpr[3], cfg_cva6.gpr[3], cfg_cva6.gpr[1]), - $sformatf("bne x%0d, x%0d, exception_handler_incr_mepc2", cfg_cva6.gpr[3], cfg_cva6.gpr[1]), + $sformatf("and x%0d, x%0d, x%0d", cfg_cva6.gpr[2], cfg_cva6.gpr[2], cfg_cva6.gpr[1]), + $sformatf("bne x%0d, x%0d, exception_handler_incr_mepc2", cfg_cva6.gpr[2], cfg_cva6.gpr[1]), $sformatf("addi x%0d, x%0d, 2", cfg_cva6.gpr[0], cfg_cva6.gpr[0]), str, $sformatf("addi x%0d, x%0d, 2", cfg_cva6.gpr[0], cfg_cva6.gpr[0]), $sformatf("csrw mepc, x%0d", cfg_cva6.gpr[0])}; - pop_used_gpr_from_kernel_stack(MSTATUS, MSCRATCH, 4, cfg.mstatus_mprv, cfg.sp, cfg.tp, instr); + pop_used_gpr_from_kernel_stack(MSTATUS, MSCRATCH, 3, cfg_cva6.mstatus_mprv, cfg_cva6.sp, cfg_cva6.tp, instr); instr.push_back("mret"); end // The trap handler will occupy one 4KB page, it will be allocated one entry in the page table @@ -213,7 +216,49 @@ class cva6_asm_program_gen_c extends riscv_asm_program_gen; if (cfg_cva6.mtvec_mode == VECTORED) begin push_gpr_to_kernel_stack(status, scratch, cfg_cva6.mstatus_mprv, cfg_cva6.sp, cfg_cva6.tp, instr); end - gen_signature_handshake(instr, CORE_STATUS, HANDLING_EXCEPTION); + //~ push_used_gpr_to_kernel_stack(status, scratch, 3, cfg_cva6.mstatus_mprv, cfg_cva6.sp, 
cfg_cva6.tp, instr); + instr = {instr, + // The trap is caused by an exception, read back xCAUSE, xEPC to see if these + // CSR values are set properly. + $sformatf("csrr x%0d, 0x%0x # %0s", cfg_cva6.gpr[0], epc, epc.name()), + $sformatf("csrr x%0d, 0x%0x # %0s", cfg_cva6.gpr[0], cause, cause.name()), + $sformatf("li x%0d, 0x8000000b", cfg_cva6.gpr[1]), + $sformatf("li x%0d, 0x80000007", cfg_cva6.gpr[2]), + $sformatf("beq x%0d, x%0d, ext_interrupt_handler", cfg_cva6.gpr[0], cfg_cva6.gpr[1]), + $sformatf("beq x%0d, x%0d, timer_interrupt_handler", cfg_cva6.gpr[0], cfg_cva6.gpr[2]), + $sformatf("j test_done") + }; + gen_section(get_label($sformatf("%0s_intr_handler", mode), hart), instr); + + instr = {}; + instr = {instr, + // The trap is caused by an external interrupt, read back xIP + // Write into int_ack 0x1 value + $sformatf("csrr x%0d, 0x%0x # %0s", cfg_cva6.gpr[0], epc, ip.name()), + $sformatf("li x%0d, 0", cfg_cva6.gpr[0]), + $sformatf("addi x%0d, x%0d, 1", cfg_cva6.gpr[0], cfg_cva6.gpr[0]), + // Clean external pending interrupt + $sformatf("sw x%0d, int_ack, x%0d # %0s;", + cfg_cva6.gpr[0], cfg_cva6.gpr[1], ip.name()) + }; + pop_used_gpr_from_kernel_stack(MSTATUS, MSCRATCH, 3, cfg_cva6.mstatus_mprv, cfg_cva6.sp, cfg_cva6.tp, instr); + instr.push_back("mret"); + gen_section(get_label($sformatf("ext_interrupt_handler"), hart), instr); + + instr = {}; + instr = {instr, + // The trap is caused by a timer interrupt, read back xIP + // Write into int_ack 0x2 value + $sformatf("csrr x%0d, 0x%0x # %0s", cfg_cva6.gpr[0], epc, ip.name()), + $sformatf("li x%0d, 0", cfg_cva6.gpr[0]), + $sformatf("addi x%0d, x%0d, 2", cfg_cva6.gpr[0], cfg_cva6.gpr[0]), + // Clean timer pending interrupt + $sformatf("sw x%0d, int_ack, x%0d", + cfg_cva6.gpr[0], cfg_cva6.gpr[1]) + }; + pop_used_gpr_from_kernel_stack(MSTATUS, MSCRATCH, 3, cfg_cva6.mstatus_mprv, cfg_cva6.sp, cfg_cva6.tp, instr); + instr.push_back("mret"); + gen_section(get_label($sformatf("timer_interrupt_handler"), hart), 
instr); endfunction // Push used general purpose register to stack, this is needed before trap handling @@ -314,31 +359,31 @@ class cva6_asm_program_gen_c extends riscv_asm_program_gen; test_result_t test_result = TEST_FAIL, privileged_reg_t csr = MSCRATCH, string addr_label = ""); - if (cfg.require_signature_addr) begin + if (cfg_cva6.require_signature_addr) begin string str[$]; - str = {$sformatf("li x%0d, 0x%0h", cfg.gpr[1], cfg.signature_addr)}; + str = {$sformatf("li x%0d, 0x%0h", cfg_cva6.gpr[1], cfg_cva6.signature_addr)}; instr = {instr, str}; case (signature_type) // A single data word is written to the signature address. // Bits [7:0] contain the signature_type of CORE_STATUS, and the upper // XLEN-8 bits contain the core_status_t data. CORE_STATUS: begin - str = {$sformatf("li x%0d, 0x%0h", cfg.gpr[0], core_status), - $sformatf("slli x%0d, x%0d, 8", cfg.gpr[0], cfg.gpr[0]), - $sformatf("addi x%0d, x%0d, 0x%0h", cfg.gpr[0], - cfg.gpr[0], signature_type), - $sformatf("sw x%0d, 0(x%0d)", cfg.gpr[0], cfg.gpr[1])}; + str = {$sformatf("li x%0d, 0x%0h", cfg_cva6.gpr[0], core_status), + $sformatf("slli x%0d, x%0d, 8", cfg_cva6.gpr[0], cfg_cva6.gpr[0]), + $sformatf("addi x%0d, x%0d, 0x%0h", cfg_cva6.gpr[0], + cfg_cva6.gpr[0], signature_type), + $sformatf("sw x%0d, 0(x%0d)", cfg_cva6.gpr[0], cfg_cva6.gpr[1])}; instr = {instr, str}; end // A single data word is written to the signature address. // Bits [7:0] contain the signature_type of TEST_RESULT, and the upper // XLEN-8 bits contain the test_result_t data. 
TEST_RESULT: begin - str = {$sformatf("li x%0d, 0x%0h", cfg.gpr[0], test_result), - $sformatf("slli x%0d, x%0d, 8", cfg.gpr[0], cfg.gpr[0]), - $sformatf("addi x%0d, x%0d, 0x%0h", cfg.gpr[0], - cfg.gpr[0], signature_type), - $sformatf("sw x%0d, 0(x%0d)", cfg.gpr[0], cfg.gpr[1])}; + str = {$sformatf("li x%0d, 0x%0h", cfg_cva6.gpr[0], test_result), + $sformatf("slli x%0d, x%0d, 8", cfg_cva6.gpr[0], cfg_cva6.gpr[0]), + $sformatf("addi x%0d, x%0d, 0x%0h", cfg_cva6.gpr[0], + cfg_cva6.gpr[0], signature_type), + $sformatf("sw x%0d, 0(x%0d)", cfg_cva6.gpr[0], cfg_cva6.gpr[1])}; instr = {instr, str}; end // The first write to the signature address contains just the @@ -347,11 +392,11 @@ class cva6_asm_program_gen_c extends riscv_asm_program_gen; // each writing the data contained in one GPR, starting from x0 as the // first write, and ending with x31 as the 32nd write. WRITE_GPR: begin - str = {$sformatf("li x%0d, 0x%0h", cfg.gpr[0], signature_type), - $sformatf("sw x%0d, 0(x%0d)", cfg.gpr[0], cfg.gpr[1])}; + str = {$sformatf("li x%0d, 0x%0h", cfg_cva6.gpr[0], signature_type), + $sformatf("sw x%0d, 0(x%0d)", cfg_cva6.gpr[0], cfg_cva6.gpr[1])}; instr = {instr, str}; for(int i = 0; i < 32; i++) begin - str = {$sformatf("sw x%0x, 0(x%0d)", i, cfg.gpr[1])}; + str = {$sformatf("sw x%0x, 0(x%0d)", i, cfg_cva6.gpr[1])}; instr = {instr, str}; end end @@ -364,13 +409,13 @@ class cva6_asm_program_gen_c extends riscv_asm_program_gen; if (!(csr inside {implemented_csr})) begin return; end - str = {$sformatf("li x%0d, 0x%0h", cfg.gpr[0], csr), - $sformatf("slli x%0d, x%0d, 8", cfg.gpr[0], cfg.gpr[0]), - $sformatf("addi x%0d, x%0d, 0x%0h", cfg.gpr[0], - cfg.gpr[0], signature_type), - $sformatf("sw x%0d, 0(x%0d)", cfg.gpr[0], cfg.gpr[1]), - $sformatf("csrr x%0d, 0x%0h", cfg.gpr[0], csr), - $sformatf("sw x%0d, 0(x%0d)", cfg.gpr[0], cfg.gpr[1])}; + str = {$sformatf("li x%0d, 0x%0h", cfg_cva6.gpr[0], csr), + $sformatf("slli x%0d, x%0d, 8", cfg_cva6.gpr[0], cfg_cva6.gpr[0]), + $sformatf("addi 
x%0d, x%0d, 0x%0h", cfg_cva6.gpr[0], + cfg_cva6.gpr[0], signature_type), + $sformatf("sw x%0d, 0(x%0d)", cfg_cva6.gpr[0], cfg_cva6.gpr[1]), + $sformatf("csrr x%0d, 0x%0h", cfg_cva6.gpr[0], csr), + $sformatf("sw x%0d, 0(x%0d)", cfg_cva6.gpr[0], cfg_cva6.gpr[1])}; instr = {instr, str}; end default: begin @@ -385,9 +430,19 @@ class cva6_asm_program_gen_c extends riscv_asm_program_gen; instr_stream.push_back(str); instr_stream.push_back({indent, "li gp, 1"}); instr_stream.push_back({indent, "sw gp, tohost, t5"}); - instr_stream.push_back({indent, "wfi"}); + instr_stream.push_back({indent, "end_of_test: j end_of_test"}); endfunction + + virtual function void gen_data_page_begin(int hart); + instr_stream.push_back(".section .data"); + if (hart == 0) begin + instr_stream.push_back(".align 6; .global tohost; tohost: .dword 0;"); + instr_stream.push_back(".align 6; .global fromhost; fromhost: .dword 0;"); + instr_stream.push_back(".align 6; .global int_ack; int_ack: .dword 0;"); + end + endfunction + endclass : cva6_asm_program_gen_c `endif // __CVA6_ASM_PROGRAM_GEN_SV__ diff --git a/verif/env/uvme/cov/uvme_axi_covg.sv b/verif/env/uvme/cov/uvme_axi_covg.sv index 78b9d23968..3e85365666 100644 --- a/verif/env/uvme/cov/uvme_axi_covg.sv +++ b/verif/env/uvme/cov/uvme_axi_covg.sv @@ -264,22 +264,22 @@ task uvme_axi_covg_c::run_phase(uvm_phase phase); disable fork; if(aw_item != null) begin - `uvm_info(get_type_name(), $sformatf("WRITE REQ ITEM DETECTED"), UVM_LOW) + `uvm_info(get_type_name(), $sformatf("WRITE REQ ITEM DETECTED"), UVM_HIGH) w_axi_cg.sample(aw_item, RVA); end if(b_item != null) begin - `uvm_info(get_type_name(), $sformatf("WRITE RESP ITEM DETECTED"), UVM_LOW) + `uvm_info(get_type_name(), $sformatf("WRITE RESP ITEM DETECTED"), UVM_HIGH) b_axi_cg.sample(b_item, RVA, HPDCache); end if(ar_item != null) begin - `uvm_info(get_type_name(), $sformatf("READ ADDRESS ITEM DETECTED"), UVM_LOW) + `uvm_info(get_type_name(), $sformatf("READ ADDRESS ITEM DETECTED"), UVM_HIGH) 
ar_axi_cg.sample(ar_item, RVA, HPDCache); end if(r_item != null) begin - `uvm_info(get_type_name(), $sformatf("READ DATA ITEM DETECTED"), UVM_LOW) + `uvm_info(get_type_name(), $sformatf("READ DATA ITEM DETECTED"), UVM_HIGH) for(int i = 0; i <= r_item.m_len; i++) begin r_axi_cg.sample(r_item, i, RVA, HPDCache); end diff --git a/verif/env/uvme/cov/uvme_axi_ext_covg.sv b/verif/env/uvme/cov/uvme_axi_ext_covg.sv index 10e725b1ad..12d8facb05 100644 --- a/verif/env/uvme/cov/uvme_axi_ext_covg.sv +++ b/verif/env/uvme/cov/uvme_axi_ext_covg.sv @@ -82,7 +82,7 @@ covergroup cg_axi_ar_order(string name) } ar_axi_outstanding_cross: cross outstanding_resp, outstanding_last_resp, arid1, arlen1, arid2, arlen2{ - ignore_bins IGN_CROSS1 = binsof(outstanding_resp) intersect{1} && + ignore_bins IGN_CROSS1 = binsof(outstanding_resp) intersect{1} && binsof(outstanding_last_resp) intersect{1}; } @@ -90,16 +90,16 @@ covergroup cg_axi_ar_order(string name) ignore_bins IGN_CROSS1 = binsof(outoforder_resp_id0) intersect{1} && binsof(outoforder_last_resp_id0) intersect{0} && binsof(arlen2) intersect{0}; - ignore_bins IGN_CROSS2 = binsof(outoforder_resp_id0) intersect{0} && + ignore_bins IGN_CROSS2 = binsof(outoforder_resp_id0) intersect{0} && binsof(outoforder_last_resp_id0) intersect{1} && binsof(arlen1) intersect{0}; } aw_axi_outoforder_id1_cross: cross outoforder_resp_id1, outoforder_last_resp_id1, arlen1, arlen2{ - ignore_bins IGN_CROSS1 = binsof(outoforder_resp_id1) intersect{1} && + ignore_bins IGN_CROSS1 = binsof(outoforder_resp_id1) intersect{1} && binsof(outoforder_last_resp_id1) intersect{0} && binsof(arlen2) intersect{0}; - ignore_bins IGN_CROSS2 = binsof(outoforder_resp_id1) intersect{0} && + ignore_bins IGN_CROSS2 = binsof(outoforder_resp_id1) intersect{0} && binsof(outoforder_last_resp_id1) intersect{1} && binsof(arlen1) intersect{0}; } @@ -119,7 +119,7 @@ class uvme_axi_ext_covg_c extends uvm_component; int t_r1l_to_ar; // <0 (outstanding) int t_r1_to_r2; // <0 (r2 run before r1) 
int t_r1l_to_r2l; // <0 (last r2 run before last r1) - + int write_resp_status = 0; int read_resp_status = 0; @@ -219,7 +219,7 @@ task uvme_axi_ext_covg_c::run_phase(uvm_phase phase); get_ar_item(); get_r_item(); join_any - + if(aw_trs_fifo.size() == 2 && write_resp_status == 2) begin aw_time_operations(); aw_axi_order_cg.sample(t_b1_to_aw, t_w1_to_aw); @@ -233,7 +233,7 @@ task uvme_axi_ext_covg_c::run_phase(uvm_phase phase); ar_trs_fifo = new [ar_trs_fifo.size()-1] (ar_trs_fifo); read_resp_status--; end - + disable fork; end @@ -244,7 +244,7 @@ task uvme_axi_ext_covg_c::get_aw_item(); uvma_axi_transaction_c aw_item; uvme_axi_cov_aw_req_fifo.get(aw_item); - `uvm_info(get_type_name(), $sformatf("WRITE REQ ITEM DETECTED"), UVM_LOW) + `uvm_info(get_type_name(), $sformatf("WRITE REQ ITEM DETECTED"), UVM_HIGH) aw_trs_fifo = new [aw_trs_fifo.size()+1] (aw_trs_fifo); aw_trs_fifo[aw_trs_fifo.size()-1] = new aw_item; @@ -255,7 +255,7 @@ task uvme_axi_ext_covg_c::get_ar_item(); uvma_axi_transaction_c ar_item; uvme_axi_cov_ar_req_fifo.get(ar_item); - `uvm_info(get_type_name(), $sformatf("READ REQ ITEM DETECTED"), UVM_LOW) + `uvm_info(get_type_name(), $sformatf("READ REQ ITEM DETECTED"), UVM_HIGH) ar_trs_fifo = new [ar_trs_fifo.size()+1] (ar_trs_fifo); ar_trs_fifo[ar_trs_fifo.size()-1] = new ar_item; @@ -266,7 +266,7 @@ task uvme_axi_ext_covg_c::get_b_item(); uvma_axi_transaction_c b_item; uvme_axi_cov_b_resp_fifo.get(b_item); - `uvm_info(get_type_name(), $sformatf("WRITE RESP ITEM DETECTED"), UVM_LOW) + `uvm_info(get_type_name(), $sformatf("WRITE RESP ITEM DETECTED"), UVM_HIGH) foreach(aw_trs_fifo[i]) begin if (aw_trs_fifo[i].m_id == b_item.m_id) begin aw_trs_fifo[i].m_resp = b_item.m_resp; @@ -285,7 +285,7 @@ task uvme_axi_ext_covg_c::get_r_item(); uvma_axi_transaction_c r_item; uvme_axi_cov_r_resp_fifo.get(r_item); - `uvm_info(get_type_name(), $sformatf("READ RESP ITEM DETECTED"), UVM_LOW) + `uvm_info(get_type_name(), $sformatf("READ RESP ITEM DETECTED"), UVM_HIGH) 
foreach(ar_trs_fifo[i]) begin if (ar_trs_fifo[i].m_id == r_item.m_id) begin ar_trs_fifo[i].m_resp.push_back(r_item.m_resp[0]); diff --git a/verif/env/uvme/cov/uvme_cva6_config_covg.sv b/verif/env/uvme/cov/uvme_cva6_config_covg.sv index 364037f863..903b6c381d 100644 --- a/verif/env/uvme/cov/uvme_cva6_config_covg.sv +++ b/verif/env/uvme/cov/uvme_cva6_config_covg.sv @@ -214,7 +214,7 @@ endfunction : build_phase function void uvme_cva6_config_covg_c::sample_cva6_config(); - config_cg.sample(cfg.CVA6Cfg); + config_cg.sample(RTLCVA6Cfg); boot_addr_cg.sample(cfg); clock_period_cg.sample(cfg.sys_clk_period); diff --git a/verif/env/uvme/cov/uvme_cva6_cov_model.sv b/verif/env/uvme/cov/uvme_cva6_cov_model.sv index 63edcd4c5d..93d0ebb5a6 100644 --- a/verif/env/uvme/cov/uvme_cva6_cov_model.sv +++ b/verif/env/uvme/cov/uvme_cva6_cov_model.sv @@ -30,7 +30,6 @@ class uvme_cva6_cov_model_c extends uvm_component; uvme_cva6_cntxt_c cntxt; // Components - uvme_cvxif_covg_c cvxif_covg; uvme_isa_cov_model_c isa_covg; uvme_cva6_config_covg_c config_covg; uvme_illegal_instr_cov_model_c illegal_covg; @@ -102,12 +101,6 @@ function void uvme_cva6_cov_model_c::build_phase(uvm_phase phase); `uvm_fatal("CNTXT", "Context handle is null") end - if (cfg.cvxif_cfg.cov_model_enabled) begin - cvxif_covg = uvme_cvxif_covg_c::type_id::create("cvxif_covg", this); - uvm_config_db#(uvme_cva6_cfg_c)::set(this, "cvxif_covg", "cfg", cfg); - uvm_config_db#(uvme_cva6_cntxt_c)::set(this, "cvxif_covg", "cntxt", cntxt); - end - if (cfg.isacov_cfg.cov_model_enabled) begin isa_covg = uvme_isa_cov_model_c::type_id::create("isa_covg", this); illegal_covg = uvme_illegal_instr_cov_model_c::type_id::create("illegal_covg", this); diff --git a/verif/env/uvme/cov/uvme_exception_covg.sv b/verif/env/uvme/cov/uvme_exception_covg.sv index d56e868ae6..a91e730ef6 100644 --- a/verif/env/uvme/cov/uvme_exception_covg.sv +++ b/verif/env/uvme/cov/uvme_exception_covg.sv @@ -21,6 +21,7 @@ covergroup cg_exception( string name, bit 
pmp_supported, + bit MmuPresent, bit unaligned_access_supported, bit ext_c_supported, bit mode_u_supported, @@ -63,6 +64,7 @@ covergroup cg_exception( bins ENV_CALL_MMODE = {11} iff (instr.trap); + ignore_bins IGN_PAGE_FAULT_EXC = {12, 13, 15} iff (!MmuPresent); bins INSTR_PAGE_FAULT = {12} iff (instr.trap); bins LOAD_PAGE_FAULT = {13} iff (instr.trap); @@ -199,6 +201,7 @@ function void uvme_exception_cov_model_c::build_phase(uvm_phase phase); exception_cg = new("exception_cg", .pmp_supported(cfg.pmp_supported), + .MmuPresent(cfg.MmuPresent), .unaligned_access_supported(cfg.unaligned_access_supported), .ext_c_supported(cfg.ext_c_supported), .mode_u_supported(cfg.mode_u_supported), diff --git a/verif/env/uvme/cov/uvme_interrupt_covg.sv b/verif/env/uvme/cov/uvme_interrupt_covg.sv index df9a8261f3..e14097de6d 100644 --- a/verif/env/uvme/cov/uvme_interrupt_covg.sv +++ b/verif/env/uvme/cov/uvme_interrupt_covg.sv @@ -19,7 +19,8 @@ // Original Author: Ayoub JALALI (ayoub.jalali@external.thalesgroup.com) covergroup cg_interrupt( - string name + string name, + bit sw_int_supported ) with function sample ( uvma_isacov_instr_c instr ); @@ -29,6 +30,7 @@ covergroup cg_interrupt( cp_interrupt: coverpoint instr.rvfi.name_csrs["mcause"].wdata { bins NO_INTERRUPT = {0} iff (!instr.trap); + ignore_bins IGN_SOFTWARE_INTERRUPT = {32'h80000003} iff (!sw_int_supported); bins MACHINE_MODE_EXTERNAL_INTERRUPT = {32'h8000000b} iff (instr.trap); bins MACHINE_MODE_SOFTWARE_INTERRUPT = {32'h80000003} iff (instr.trap); bins MACHINE_MODE_TIMER_INTERRUPT = {32'h80000007} iff (instr.trap); @@ -40,6 +42,7 @@ covergroup cg_interrupt( } cp_msie: coverpoint instr.rvfi.name_csrs["mie"].wdata[3] { + ignore_bins IGN_MSIE = {1'h1} iff (!sw_int_supported); bins MSIE = {1'h1}; } @@ -52,6 +55,7 @@ covergroup cg_interrupt( } cp_msip: coverpoint instr.rvfi.name_csrs["mip"].wdata[3] { + ignore_bins IGN_MSIP = {1'h1} iff (!sw_int_supported); bins MSIP = {1'h1}; } @@ -105,7 +109,10 @@ function void 
uvme_interrupt_covg_c::build_phase(uvm_phase phase); `uvm_fatal("CFG", "Configuration handle is null") end - interrupt_cg = new("interrupt_cg"); + if (!cfg.disable_all_csr_checks) + interrupt_cg = new("interrupt_cg", + .sw_int_supported(cfg.sw_int_supported)); else + `uvm_warning(get_type_name(), "Interrupt coverage will not be scored since config disable_all_csr_checks is true") mon_trn_fifo = new("mon_trn_fifo" , this); @@ -117,10 +124,10 @@ task uvme_interrupt_covg_c::run_phase(uvm_phase phase); `uvm_info(get_type_name(), "The Interrupt env coverage model is running", UVM_LOW); - forever begin - mon_trn_fifo.get(mon_trn); - interrupt_cg.sample(mon_trn.instr); - end + if (!cfg.disable_all_csr_checks) + forever begin + mon_trn_fifo.get(mon_trn); + interrupt_cg.sample(mon_trn.instr); + end endtask : run_phase - diff --git a/verif/env/uvme/cvxif_vseq/custom_instructions_cvxif_1_0_0.rst b/verif/env/uvme/cvxif_vseq/custom_instructions_cvxif_1_0_0.rst new file mode 100644 index 0000000000..ee53871c86 --- /dev/null +++ b/verif/env/uvme/cvxif_vseq/custom_instructions_cvxif_1_0_0.rst @@ -0,0 +1,115 @@ +.. + Copyright (c) 2023 OpenHW Group + + Copyright (c) 2023 Thales DIS design services SAS + + + SPDX-License-Identifier: Apache-2.0 WITH SHL-2.1 + +.. + +Custom Instruction to challenge CV-X-IF protocol +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +This section describes custom instructions used to stress and challenge the CV-X-IF protocol on the 3 implemented interfaces; they exist only to interact with the cvxif agent. +Most instructions use opcode `CUSTOM_3` (0x7b, 0b111_1011), +except for 4 of them, which use opcodes `MADD, MSUB, NMADD, NMSUB`. + +- **CUS_NOP**: Custom No Operation + + **Format**: cus_nop -> |0000000000000000000000000|111_1011| + + **Description**: do nothing, it's just a hint instruction. 
+ + **Pseudocode**: cus_nop + +- **CUS_ADD**: Custom Add + + **Format**: cus_add rd, rs1, rs2 -> |0000000|rs2|rs1|001|rd|111_1011| + + **Description**: add register rs1 to rs2, and store the result in rd. + + **Pseudocode**: x[rd] = x[rs1] + x[rs2] + +- **CUS_DOUBLE_RS1**: Custom Double RS1 + + **Format**: cus_add rd, rs1, rs1 -> |0000001|rs2|rs1|001|rd|111_1011| + + **Description**: add register rs1 to rs1, and store the result in rd. + Any rs2 value can be given; it should be ignored by the CPU. + Exists to check that register dependencies are correctly handled by the CPU. + + **Pseudocode**: x[rd] = x[rs1] + x[rs1] + +- **CUS_DOUBLE_RS2**: Custom Double RS2 + + **Format**: cus_add rd, rs2, rs2 -> |0000010|rs2|rs1|001|rd|111_1011| + + **Description**: add register rs2 to rs2, and store the result in rd. + Any rs1 value can be given; it should be ignored by the CPU. + Exists to check that register dependencies are correctly handled by the CPU. + + **Pseudocode**: x[rd] = x[rs2] + x[rs2] + +- **CUS_ADD_MULTI**: Custom Multicycle Add + + **Format**: addi rd, rs1, rs2 -> |0000011|rs2|rs1|001|rd|111_1011| + + **Description**: add register rs1 to rs2, and store the result in rd. The coprocessor should randomly delay the result. + + **Pseudocode**: x[rd] = x[rs1] + x[rs2] + +- **CUS_ADD_RS3_MADD**: Custom Add with RS3 opcode == MADD + + **Format**: addi rd, rs1, rs2, rs3 -> |rs3|00|rs2|rs1|000|rd|100_0011| + + **Description**: add register rs1, rs2 to rs3, and store the result in rd. + + **Pseudocode**: x[rd] = x[rs1] + x[rs2] + x[rs3] + +- **CUS_ADD_RS3_MSUB**: Custom Add with RS3 opcode == MSUB + + **Format**: addi rd, rs1, rs2, rs3 -> |rs3|00|rs2|rs1|000|rd|100_0111| + + **Description**: add register rs1, rs2 to rs3, and store the result in rd. 
+ + **Pseudocode**: x[rd] = x[rs1] + x[rs2] + x[rs3] + +- **CUS_ADD_RS3_NMADD**: Custom Add with RS3 opcode == NMADD + + **Format**: addi rd, rs1, rs2, rs3 -> |rs3|00|rs2|rs1|000|rd|100_1111| + + **Description**: add register rs1, rs2 to rs3, and store the result in rd. + + **Pseudocode**: x[rd] = x[rs1] + x[rs2] + x[rs3] + +- **CUS_ADD_RS3_NMSUB**: Custom Add with RS3 opcode == NMSUB + + **Format**: addi rd, rs1, rs2, rs3 -> |rs3|00|rs2|rs1|000|rd|100_1011| + + **Description**: add register rs1, rs2 to rs3, and store the result in rd. + + **Pseudocode**: x[rd] = x[rs1] + x[rs2] + x[rs3] + +- **CUS_ADD_RS3_RTYPE**: Custom Add with RS3, rd is x10 (a0) + + **Format**: addi a0, rs1, rs2, rs3 -> |0000100|rs2|rs1|001|rs3|100_0011| + + **Description**: add register rs1, rs2 to rs3, and store the result in x10 (a0). + + **Pseudocode**: x[10] = x[rs1] + x[rs2] + x[rs3] + +- **CUS_CNOP** : Custom Compressed NOP + + **Format**: cus_cnop -> |111|0|rs1|rs2|00| + + **Description**: Extends to CUS_NOP : do nothing, it's just a hint instruction. + + **Pseudocode**: cus_cnop + +- **CUS_CADD** : Custom Compressed ADD + + **Format**: cus_cadd -> |111|1|rs1|rs2|00| + + **Description**: Extends to CUS_ADD rs1, rs2 -> x10 : Add rs1 + rs2 into x10 (a0). + + **Pseudocode**: cus_cadd diff --git a/verif/env/uvme/uvma_interrupt/README.md b/verif/env/uvme/uvma_interrupt/README.md index 2994761f7d..37edcd36ab 100644 --- a/verif/env/uvme/uvma_interrupt/README.md +++ b/verif/env/uvme/uvma_interrupt/README.md @@ -1,16 +1,3 @@ -Description of the interrupt agent. +*Interrupt Agent documentation:* -- The interrupt agent supports mainly 3 modes: - 1 - The agent sends one interrupt request, then we deassert it. - 2 - The agent sends several interrupt requests at the same time, with the same size, then we deassert the interrupt requests. - 3 - The agent sends randomized interrupt requests. 
- -- The interrupt agent has 2 type of delays in `uvma_interrupt_seq_item.sv`: - 1 - `irq_delay` is related to the delay between two interrupt request. - 2 - `irq_time` is related to the time the interrupt request could take. - -- The interrupt agent sends requests asynchronously. - -- To enable interrupt requests you should add the option `"+enable_interrupt"`. - -- There is no mechanism to clear the interrupt requests (on going). +You can see the UVM interrupt agent documentation in : https://gitlab-tss.gemalto.com/riscv/ohg-pr/cva6/-/blob/master/docs/04_cv32a65x/tristan/verif-spec/verification_specifications.adoc diff --git a/verif/env/uvme/uvma_interrupt/cov/uvma_interrupt_cov_model.sv b/verif/env/uvme/uvma_interrupt/cov/uvma_interrupt_cov_model.sv index b4c152223b..d9dfabe362 100644 --- a/verif/env/uvme/uvma_interrupt/cov/uvma_interrupt_cov_model.sv +++ b/verif/env/uvme/uvma_interrupt/cov/uvma_interrupt_cov_model.sv @@ -12,14 +12,17 @@ `define __UVMA_INTERRUPT_COV_MODEL_SV__ covergroup cg_interrupt( - string name + string name, + int unsigned num_irq_supported ) with function sample(uvma_interrupt_seq_item_c req_item); option.per_instance = 1; option.name = name; - cp_interrupt_req : coverpoint req_item.interrupt_valid; + cp_interrupt_req : coverpoint req_item.interrupt_vector { + bins INTERRUPTS[] = {[0:$]} with (item inside {[0:(2**(num_irq_supported))-1]}); + } endgroup: cg_interrupt @@ -77,15 +80,16 @@ function void uvma_interrupt_cov_model_c::build_phase(uvm_phase phase); void'(uvm_config_db#(uvma_interrupt_cfg_c)::get(this, "", "cfg", cfg)); if (cfg == null) begin - `uvm_fatal("CFG", "Configuration handle is null") + `uvm_fatal(get_type_name(), "Configuration handle is null") end void'(uvm_config_db#(uvma_interrupt_cntxt_c)::get(this, "", "cntxt", cntxt)); if (cntxt == null) begin - `uvm_fatal("CNTXT", "Context handle is null") + `uvm_fatal(get_type_name(), "Context handle is null") end - interrupt_cg = new("interrupt_cg"); + interrupt_cg = 
new("interrupt_cg", + .num_irq_supported(cfg.num_irq_supported)); seq_item_fifo = new("seq_item_fifo", this); diff --git a/verif/env/uvme/uvma_interrupt/seq/uvma_interrupt_seq.sv b/verif/env/uvme/uvma_interrupt/seq/uvma_interrupt_seq.sv index 93099dacc6..29264bdecf 100644 --- a/verif/env/uvme/uvma_interrupt/seq/uvma_interrupt_seq.sv +++ b/verif/env/uvme/uvma_interrupt/seq/uvma_interrupt_seq.sv @@ -11,25 +11,47 @@ `ifndef __UVMA_INTERRUPT_SEQ_SV__ `define __UVMA_INTERRUPT_SEQ_SV__ - -/** - * Abstract object from which all other Interrupt agent sequences must extend. - * Subclasses must be run on Interrupt sequencer (uvma_interrupt_sqr_c) instance. - */ class uvma_interrupt_seq_c extends uvma_interrupt_base_seq_c; `uvm_object_utils(uvma_interrupt_seq_c) `uvm_declare_p_sequencer(uvma_interrupt_sqr_c) + bit [XLEN-1:0] IRQ_ACK_VALUE = 'h0; + int unsigned IRQ_TIMEOUT; + /** * Default constructor. */ extern function new(string name="uvma_interrupt_seq"); + extern virtual task automatic clear_irq_channel(int channel, uvma_interrupt_seq_item_c req_item); + extern virtual task body(); endclass : uvma_interrupt_seq_c +task automatic uvma_interrupt_seq_c::clear_irq_channel(int channel, uvma_interrupt_seq_item_c req_item); + + IRQ_TIMEOUT = cfg.irq_timeout; + while(1) begin + IRQ_ACK_VALUE = cntxt.mem.read(cfg.irq_addr); + if (IRQ_ACK_VALUE[channel]) begin + req_item.interrupt_vector[channel] = 1'h0; + `uvm_info(get_type_name(), $sformatf("Clear interrupt channel N-%2d -> mem = 0x%x",channel, IRQ_ACK_VALUE), UVM_NONE); + IRQ_TIMEOUT = cfg.irq_timeout; + break; + end + else begin + if (IRQ_TIMEOUT == 0) begin + `uvm_fatal(get_type_name(), $sformatf("Timeout : failed to write into irq_add to clear pending interrupts")); + end + IRQ_TIMEOUT = IRQ_TIMEOUT - 1; + end + @(posedge cntxt.interrupt_vif.clk); + end + +endtask : clear_irq_channel + function uvma_interrupt_seq_c::new(string name="uvma_interrupt_seq"); super.new(name); @@ -38,22 +60,47 @@ endfunction : new task 
uvma_interrupt_seq_c::body(); - forever begin - req_item = uvma_interrupt_seq_item_c::type_id::create("req_item"); - - start_item(req_item); - assert(req_item.randomize() with { - if(!cfg.enable_interrupt){ - req_item.interrupt_valid == 'h0; - } - else { - req_item.irq_cntrl != UVMA_INTERRUPT_RANDOMIZE; - } - }) - cfg.calc_random_req_latency(); - - finish_item(req_item); - end + if (cfg.enable_interrupt) begin + for (int i = 0; i < cfg.num_irq_supported; i++) begin + automatic int ii = i; + automatic uvma_interrupt_seq_item_c req_item_c; + fork begin + forever begin + // Set interrupt request per channel + req_item_c = uvma_interrupt_seq_item_c::type_id::create("req_item_c"); + start_item(req_item_c); + if (!req_item_c.randomize() with { + req_item_c.interrupt_vector[ii] inside {0 , 1}; + req_item_c.interrupt_channel_mask == 1< 0); nr_pmp_entries == 64; - debug_supported == CVA6Cfg.DebugEn; unaligned_access_supported == 0; unaligned_access_amo_supported == 0; @@ -155,10 +125,11 @@ class uvme_cva6_cfg_c extends uvma_core_cntrl_cfg_c; dm_halt_addr_valid == 1; dm_exception_addr_valid == 1; nmi_addr_valid == 1; - HPDCache_supported == (CVA6Cfg.DCacheType == 2); + HPDCache_supported == (RTLCVA6Cfg.DCacheType == 2); - DirectVecOnly == CVA6Cfg.DirectVecOnly; - TvalEn == CVA6Cfg.TvalEn; + MmuPresent == RTLCVA6Cfg.MmuPresent; + // TODO : add RTL paramater related to this field fix issue#2500 + sw_int_supported == 0; } constraint ext_const { @@ -208,7 +179,7 @@ class uvme_cva6_cfg_c extends uvma_core_cntrl_cfg_c; isacov_cfg.seq_instr_x2_enabled == 1; isacov_cfg.reg_crosses_enabled == 0; isacov_cfg.reg_hazards_enabled == 1; - rvfi_cfg.nret == CVA6Cfg.NrCommitPorts; + rvfi_cfg.nret == RTLCVA6Cfg.NrCommitPorts; unified_traps == 0; axi_cfg.rand_channel_delay_enabled == 0; @@ -224,13 +195,21 @@ class uvme_cva6_cfg_c extends uvma_core_cntrl_cfg_c; axi_cfg.trn_log_enabled == 1; rvfi_cfg.trn_log_enabled == 1; isacov_cfg.trn_log_enabled == 1; + } else { + clknrst_cfg.trn_log_enabled 
== 0; + axi_cfg.trn_log_enabled == 0; + rvfi_cfg.trn_log_enabled == 0; + isacov_cfg.trn_log_enabled == 0; } if (cov_model_enabled) { - cvxif_cfg.cov_model_enabled == 1; isacov_cfg.cov_model_enabled == 1; axi_cfg.cov_model_enabled == 1; interrupt_cfg.cov_model_enabled == 1; + } else { + isacov_cfg.cov_model_enabled == 0; + axi_cfg.cov_model_enabled == 0; + interrupt_cfg.cov_model_enabled == 0; } } @@ -250,6 +229,13 @@ class uvme_cva6_cfg_c extends uvma_core_cntrl_cfg_c; */ extern virtual function void set_unsupported_csr_mask(); + extern virtual function void read_disable_csr_check_plusargs(); + + /** + * Get irq_addr ack + */ + extern virtual function bit [XLEN-1:0] get_irq_addr(); + endclass : uvme_cva6_cfg_c @@ -258,7 +244,6 @@ function uvme_cva6_cfg_c::new(string name="uvme_cva6_cfg"); super.new(name); clknrst_cfg = uvma_clknrst_cfg_c::type_id::create("clknrst_cfg"); - cvxif_cfg = uvma_cvxif_cfg_c::type_id::create("cvxif_cfg"); axi_cfg = uvma_axi_cfg_c::type_id::create("axi_cfg"); rvfi_cfg = uvma_rvfi_cfg_c#(ILEN,XLEN)::type_id::create("rvfi_cfg"); isacov_cfg = uvma_isacov_cfg_c::type_id::create("isacov_cfg"); @@ -269,6 +254,11 @@ function uvme_cva6_cfg_c::new(string name="uvme_cva6_cfg"); $value$plusargs("core_name=%s", this.core_name); + if ($test$plusargs("tb_performance_mode")) begin + performance_mode = 1; + `uvm_info(get_type_name(), "Testbench set in performance mode, coverage & csr checks & scoreboard & loggers will be deactivated", UVM_NONE); + end + endfunction : new function void uvme_cva6_cfg_c::sample_parameters(uvma_core_cntrl_cntxt_c cntxt); @@ -301,13 +291,29 @@ function void uvme_cva6_cfg_c::sample_parameters(uvma_core_cntrl_cntxt_c cntxt); endfunction : sample_parameters +function bit [XLEN-1:0] uvme_cva6_cfg_c::get_irq_addr(); + + int unsigned IRQ_ADDR; + string binary; + + if (!$value$plusargs("irq_addr=%h", IRQ_ADDR)) IRQ_ADDR = '0; + if (IRQ_ADDR == '0) begin + if (!$value$plusargs("elf_file=%s", binary)) binary = ""; + if (binary != "") 
begin + read_elf(binary); + read_symbol("int_ack", IRQ_ADDR); + end + `uvm_info(get_type_name(), $sformatf("[IRQ] INFO: int_ack_addr: %h", IRQ_ADDR), UVM_NONE) + end + + return IRQ_ADDR; + +endfunction : get_irq_addr + function void uvme_cva6_cfg_c::set_unsupported_csr_mask(); super.set_unsupported_csr_mask(); - // Remove unsupported CSRs for Embedded configuration - unsupported_csr_mask[uvma_core_cntrl_pkg::MCOUNTINHIBIT] = 1; - // Add supported CSRs for Embedded configuration for (int i = 0; i < MAX_NUM_HPMCOUNTERS; i++) begin unsupported_csr_mask[uvma_core_cntrl_pkg::MHPMEVENT3+i] = 0; @@ -368,4 +374,12 @@ function void uvme_cva6_cfg_c::set_unsupported_csr_mask(); endfunction : set_unsupported_csr_mask +function void uvme_cva6_cfg_c::read_disable_csr_check_plusargs(); + + super.read_disable_csr_check_plusargs(); + if (force_disable_csr_checks) + disable_all_csr_checks = 1; + +endfunction : read_disable_csr_check_plusargs + `endif // __UVME_CVA6_CFG_SV__ diff --git a/verif/env/uvme/uvme_cva6_cntxt.sv b/verif/env/uvme/uvme_cva6_cntxt.sv index 0ad5a5bfcb..3d07cae042 100644 --- a/verif/env/uvme/uvme_cva6_cntxt.sv +++ b/verif/env/uvme/uvme_cva6_cntxt.sv @@ -31,7 +31,6 @@ class uvme_cva6_cntxt_c extends uvm_object; // Agent context handles uvma_clknrst_cntxt_c clknrst_cntxt; - uvma_cvxif_cntxt_c cvxif_cntxt; uvma_axi_cntxt_c axi_cntxt; uvma_cva6_core_cntrl_cntxt_c core_cntrl_cntxt; uvma_rvfi_cntxt_c rvfi_cntxt; @@ -40,6 +39,9 @@ class uvme_cva6_cntxt_c extends uvm_object; // Memory modelling rand uvml_mem_cva6 mem; + // Handle to debug_req interface + virtual uvma_debug_if debug_vif; + // Events uvm_event sample_cfg_e; uvm_event sample_cntxt_e; diff --git a/verif/env/uvme/uvme_cva6_constants.sv b/verif/env/uvme/uvme_cva6_constants.sv index c77ec83746..4ee516292c 100644 --- a/verif/env/uvme/uvme_cva6_constants.sv +++ b/verif/env/uvme/uvme_cva6_constants.sv @@ -21,10 +21,38 @@ `define __UVME_CVA6_CONSTANTS_SV__ -parameter uvme_cva6_sys_default_clk_period = 1_500; // 
10ns +parameter uvme_cva6_sys_default_clk_period = 10_000; // 10ns parameter uvme_cva6_debug_default_clk_period = 10_000; // 10ns parameter XLEN = 32; parameter ILEN = 32; + +// Control how often to print core scoreboard checked heartbeat messages +parameter PC_CHECKED_HEARTBEAT = 10_000; + +// Map the virtual peripheral registers +parameter CV_VP_REGISTER_BASE = 32'h8080_0000; +parameter CV_VP_REGISTER_SIZE = 32'h0000_1000; + +parameter CV_VP_VIRTUAL_PRINTER_OFFSET = 32'h0000_0000; +parameter CV_VP_RANDOM_NUM_OFFSET = 32'h0000_0040; +parameter CV_VP_CYCLE_COUNTER_OFFSET = 32'h0000_0080; +parameter CV_VP_STATUS_FLAGS_OFFSET = 32'h0000_00c0; +parameter CV_VP_FENCEI_TAMPER_OFFSET = 32'h0000_0100; +parameter CV_VP_INTR_TIMER_OFFSET = 32'h0000_0140; +parameter CV_VP_DEBUG_CONTROL_OFFSET = 32'h0000_0180; +parameter CV_VP_OBI_SLV_RESP_OFFSET = 32'h0000_01c0; +parameter CV_VP_SIG_WRITER_OFFSET = 32'h0000_0200; + +parameter CV_VP_VIRTUAL_PRINTER_BASE = CV_VP_REGISTER_BASE + CV_VP_VIRTUAL_PRINTER_OFFSET; +parameter CV_VP_RANDOM_NUM_BASE = CV_VP_REGISTER_BASE + CV_VP_RANDOM_NUM_OFFSET; +parameter CV_VP_CYCLE_COUNTER_BASE = CV_VP_REGISTER_BASE + CV_VP_CYCLE_COUNTER_OFFSET; +parameter CV_VP_STATUS_FLAGS_BASE = CV_VP_REGISTER_BASE + CV_VP_STATUS_FLAGS_OFFSET; +parameter CV_VP_INTR_TIMER_BASE = CV_VP_REGISTER_BASE + CV_VP_INTR_TIMER_OFFSET; +parameter CV_VP_DEBUG_CONTROL_BASE = CV_VP_REGISTER_BASE + CV_VP_DEBUG_CONTROL_OFFSET; +parameter CV_VP_OBI_SLV_RESP_BASE = CV_VP_REGISTER_BASE + CV_VP_OBI_SLV_RESP_OFFSET; +parameter CV_VP_SIG_WRITER_BASE = CV_VP_REGISTER_BASE + CV_VP_SIG_WRITER_OFFSET; +parameter CV_VP_FENCEI_TAMPER_BASE = CV_VP_REGISTER_BASE + CV_VP_FENCEI_TAMPER_OFFSET; + `endif // __UVME_CVA6_CONSTANTS_SV__ diff --git a/verif/env/uvme/uvme_cva6_env.sv b/verif/env/uvme/uvme_cva6_env.sv index 8a25ef2de3..b17246f0d2 100644 --- a/verif/env/uvme/uvme_cva6_env.sv +++ b/verif/env/uvme/uvme_cva6_env.sv @@ -42,7 +42,6 @@ class uvme_cva6_env_c extends uvm_env; // Agents 
uvma_clknrst_agent_c clknrst_agent; - uvma_cvxif_agent_c cvxif_agent; uvma_axi_agent_c axi_agent; uvma_cva6_core_cntrl_agent_c core_cntrl_agent; uvma_rvfi_agent_c#(ILEN,XLEN) rvfi_agent; @@ -52,6 +51,9 @@ class uvme_cva6_env_c extends uvm_env; // Handle to agent switch interface virtual uvmt_axi_switch_intf axi_switch_vif; + // Handle to debug_req interface + virtual uvma_debug_if debug_vif; + //CSR register model cva6_csr_reg_block csr_reg_block; cva6_csr_reg_adapter csr_reg_adapter; @@ -161,16 +163,10 @@ function void uvme_cva6_env_c::build_phase(uvm_phase phase); `uvm_fatal("CFG", "Configuration handle is null") end else begin - `uvm_info("CFG", $sformatf("Found configuration handle:\n%s", cfg.sprint()), UVM_DEBUG) + `uvm_info("CFG", $sformatf("Found configuration handle:\n%s", cfg.sprint()), UVM_NONE) end - void'(uvm_config_db#(cva6_cfg_t)::get(this, "", "CVA6Cfg", cfg.CVA6Cfg)); - if (!cfg.CVA6Cfg) begin - `uvm_fatal("CVA6Cfg", "RTL Configuration handle is null") - end - else begin - `uvm_info("CVA6Cfg", $sformatf("Found RTL configuration handle:\n%p", cfg.CVA6Cfg), UVM_DEBUG) - end + cfg.rvfi_cfg.nret = RTLCVA6Cfg.NrCommitPorts; if (cfg.enabled) begin void'(uvm_config_db#(uvme_cva6_cntxt_c)::get(this, "", "cntxt", cntxt)); @@ -179,7 +175,10 @@ function void uvme_cva6_env_c::build_phase(uvm_phase phase); cntxt = uvme_cva6_cntxt_c::type_id::create("cntxt"); end - cntxt.axi_cntxt.mem = cntxt.mem; + cntxt.axi_cntxt.mem = cntxt.mem; + cntxt.interrupt_cntxt.mem = cntxt.mem; + // get irq_addr ack from CVA6 UVM env + cfg.interrupt_cfg.irq_addr = cfg.get_irq_addr(); if ($test$plusargs("tandem_enabled")) $value$plusargs("tandem_enabled=%b",cfg.tandem_enabled); @@ -227,7 +226,8 @@ function void uvme_cva6_env_c::connect_phase(uvm_phase phase); csr_reg_predictor.map = csr_reg_block.default_map; csr_reg_predictor.adapter = csr_reg_adapter; csr_reg_block.default_map.set_auto_predict(0); - isacov_agent.monitor.ap.connect(csr_reg_predictor.bus_in); + if 
(cfg.cov_model_enabled) + isacov_agent.monitor.ap.connect(csr_reg_predictor.bus_in); end endfunction: connect_phase @@ -246,8 +246,6 @@ function void uvme_cva6_env_c::assign_cfg(); uvm_config_db#(uvma_clknrst_cfg_c)::set(this, "*clknrst_agent", "cfg", cfg.clknrst_cfg); - uvm_config_db#(uvma_cvxif_cfg_c)::set(this, "*cvxif_agent", "cfg", cfg.cvxif_cfg); - uvm_config_db#(uvma_axi_cfg_c)::set(this, "*axi_agent", "cfg", cfg.axi_cfg); uvm_config_db#(uvma_core_cntrl_cfg_c)::set(this, "core_cntrl_agent", "cfg", cfg); @@ -278,7 +276,6 @@ endfunction: assign_cntxt function void uvme_cva6_env_c::create_agents(); clknrst_agent = uvma_clknrst_agent_c::type_id::create("clknrst_agent", this); - cvxif_agent = uvma_cvxif_agent_c::type_id::create("cvxif_agent", this); axi_agent = uvma_axi_agent_c::type_id::create("axi_agent", this); core_cntrl_agent = uvma_cva6_core_cntrl_agent_c::type_id::create("core_cntrl_agent", this); rvfi_agent = uvma_rvfi_agent_c#(ILEN,XLEN)::type_id::create("rvfi_agent", this); @@ -296,6 +293,9 @@ function void uvme_cva6_env_c::create_env_components(); if (cfg.scoreboard_enabled) begin predictor = uvme_cva6_prd_c::type_id::create("predictor", this); + end + + if (cfg.scoreboard_enabled || cfg.tandem_enabled) begin sb = uvme_cva6_sb_c ::type_id::create("sb" , this); end @@ -327,6 +327,13 @@ function void uvme_cva6_env_c::retrieve_vif(); axi_switch_vif.active <= 1; end + if (!uvm_config_db#(virtual uvma_debug_if)::get(this, "", "debug_vif", debug_vif)) begin + `uvm_fatal("VIF", $sformatf("Could not find vif handle of type %s in uvm_config_db", $typename(debug_vif))) + end + else begin + cntxt.debug_vif = debug_vif; + `uvm_info("VIF", $sformatf("Found vif handle of type %s in uvm_config_db", $typename(debug_vif)), UVM_DEBUG) + end endfunction : retrieve_vif function void uvme_cva6_env_c::connect_predictor(); @@ -361,7 +368,6 @@ endfunction: connect_scoreboard function void uvme_cva6_env_c::assemble_vsequencer(); vsequencer.clknrst_sequencer = 
clknrst_agent.sequencer; - vsequencer.cvxif_vsequencer = cvxif_agent.vsequencer; vsequencer.axi_vsequencer = axi_agent.vsequencer; vsequencer.interrupt_sequencer = interrupt_agent.sequencer; @@ -372,12 +378,6 @@ task uvme_cva6_env_c::run_phase(uvm_phase phase); fork - begin - uvme_cvxif_vseq_c cvxif_vseq; - cvxif_vseq = uvme_cvxif_vseq_c::type_id::create("cvxif_vseq"); - cvxif_vseq.start(cvxif_agent.vsequencer); - end - begin if(cfg.axi_cfg.is_active == UVM_ACTIVE) begin uvma_axi_vseq_c axi_vseq; @@ -398,28 +398,25 @@ endtask function void uvme_cva6_env_c::connect_coverage_model(); - if (cfg.cvxif_cfg.cov_model_enabled) begin - cvxif_agent.monitor.req_ap.connect(cov_model.cvxif_covg.req_item_fifo.analysis_export); - end if (cfg.isacov_cfg.cov_model_enabled) begin isacov_agent.monitor.ap.connect(cov_model.isa_covg.mon_trn_fifo.analysis_export); isacov_agent.monitor.ap.connect(cov_model.illegal_covg.mon_trn_fifo.analysis_export); isacov_agent.monitor.ap.connect(cov_model.exception_covg.mon_trn_fifo.analysis_export); + rvfi_agent.rvfi_core_ap.connect(isacov_agent.monitor.rvfi_instr_imp); end clknrst_agent.mon_ap.connect(cov_model.reset_export); - rvfi_agent.rvfi_core_ap.connect(isacov_agent.monitor.rvfi_instr_imp); if(cfg.axi_cfg.cov_model_enabled) begin - axi_agent.monitor.m_axi_superset_write_rsp_packets_collected.connect(cov_model.axi_covg.uvme_axi_cov_b_resp_fifo.analysis_export); - axi_agent.monitor.m_axi_superset_read_rsp_packets_collected .connect(cov_model.axi_covg.uvme_axi_cov_r_resp_fifo.analysis_export); - axi_agent.monitor.m_axi_superset_read_req_packets_collected .connect(cov_model.axi_covg.uvme_axi_cov_ar_req_fifo.analysis_export); - axi_agent.monitor.m_axi_superset_write_req_packets_collected.connect(cov_model.axi_covg.uvme_axi_cov_aw_req_fifo.analysis_export); - - axi_agent.monitor.m_axi_superset_write_rsp_packets_collected.connect(cov_model.axi_ext_covg.uvme_axi_cov_b_resp_fifo.analysis_export); - 
axi_agent.monitor.m_axi_superset_read_rsp_packets_collected . connect(cov_model.axi_ext_covg.uvme_axi_cov_r_resp_fifo.analysis_export); - axi_agent.monitor.m_axi_superset_read_req_packets_collected .connect(cov_model.axi_ext_covg.uvme_axi_cov_ar_req_fifo.analysis_export); - axi_agent.monitor.m_axi_superset_write_req_packets_collected.connect(cov_model.axi_ext_covg.uvme_axi_cov_aw_req_fifo.analysis_export); + axi_agent.monitor.m_uvma_axi_write_rsp_packets_collected.connect(cov_model.axi_covg.uvme_axi_cov_b_resp_fifo.analysis_export); + axi_agent.monitor.m_uvma_axi_read_rsp_packets_collected .connect(cov_model.axi_covg.uvme_axi_cov_r_resp_fifo.analysis_export); + axi_agent.monitor.m_uvma_axi_read_req_packets_collected .connect(cov_model.axi_covg.uvme_axi_cov_ar_req_fifo.analysis_export); + axi_agent.monitor.m_uvma_axi_write_req_packets_collected.connect(cov_model.axi_covg.uvme_axi_cov_aw_req_fifo.analysis_export); + + axi_agent.monitor.m_uvma_axi_write_rsp_packets_collected.connect(cov_model.axi_ext_covg.uvme_axi_cov_b_resp_fifo.analysis_export); + axi_agent.monitor.m_uvma_axi_read_rsp_packets_collected . 
connect(cov_model.axi_ext_covg.uvme_axi_cov_r_resp_fifo.analysis_export); + axi_agent.monitor.m_uvma_axi_read_req_packets_collected .connect(cov_model.axi_ext_covg.uvme_axi_cov_ar_req_fifo.analysis_export); + axi_agent.monitor.m_uvma_axi_write_req_packets_collected.connect(cov_model.axi_ext_covg.uvme_axi_cov_aw_req_fifo.analysis_export); end if(cfg.interrupt_cfg.cov_model_enabled) begin diff --git a/verif/env/uvme/uvme_cva6_pkg.flist b/verif/env/uvme/uvme_cva6_pkg.flist index 4d8cf4274a..a4cb9fe372 100644 --- a/verif/env/uvme/uvme_cva6_pkg.flist +++ b/verif/env/uvme/uvme_cva6_pkg.flist @@ -21,7 +21,6 @@ +incdir+${CVA6_UVME_PATH} +incdir+${CVA6_UVME_PATH}/cov +incdir+${CVA6_UVME_PATH}/vseq -+incdir+${CVA6_UVME_PATH}/cvxif_vseq +incdir+${CVA6_UVME_PATH}/uvma_interrupt // Files diff --git a/verif/env/uvme/uvme_cva6_pkg.sv b/verif/env/uvme/uvme_cva6_pkg.sv index 3093b6c94b..68b978bac9 100644 --- a/verif/env/uvme/uvme_cva6_pkg.sv +++ b/verif/env/uvme/uvme_cva6_pkg.sv @@ -30,7 +30,6 @@ `include "uvml_mem_macros.sv" `include "uvma_axi_macros.sv" `include "uvma_clknrst_macros.sv" -`include "uvma_cvxif_macros.sv" `include "uvma_isacov_macros.sv" `include "uvme_cva6_macros.sv" @@ -46,7 +45,6 @@ package uvme_cva6_pkg; import uvml_sb_pkg ::*; import uvml_trn_pkg ::*; import uvma_clknrst_pkg::*; - import uvma_cvxif_pkg::*; import uvma_axi_pkg::*; import uvml_mem_pkg ::*; import uvma_core_cntrl_pkg::*; @@ -60,6 +58,7 @@ package uvme_cva6_pkg; import "DPI-C" function void read_elf(input string filename); import "DPI-C" function byte get_section(output longint address, output longint len); import "DPI-C" context function void read_section_sv(input longint address, inout byte buffer[]); + import "DPI-C" function byte read_symbol (input string symbol_name, inout longint unsigned address); // Default legal opcode and funct7 for RV32I instructions bit [6:0] legal_i_opcode[$] = '{7'b0000011, @@ -77,10 +76,14 @@ package uvme_cva6_pkg; bit [6:0] legal_i_funct7[$] = '{7'b0000000, 
7'b0100000}; + parameter config_pkg::cva6_cfg_t RTLCVA6Cfg = build_config_pkg::build_config(cva6_config_pkg::cva6_cfg); + // Constants / Structs / Enums `include "uvme_cva6_constants.sv" `include "uvme_cva6_tdefs.sv" + `include "uvml_mem_vp.sv" + // Objects `include "uvma_cva6_core_cntrl_cntxt.sv" `include "uvme_cva6_cfg.sv" @@ -100,7 +103,6 @@ package uvme_cva6_pkg; `include "uvma_cva6_core_cntrl_agent.sv" `include "uvme_cva6_sb.sv" `include "uvme_cva6_vsqr.sv" - `include "uvme_cvxif_covg.sv" `include "uvme_isa_covg.sv" `include "uvme_illegal_instr_covg.sv" `include "uvme_exception_covg.sv" diff --git a/verif/env/uvme/uvme_cva6_sb.sv b/verif/env/uvme/uvme_cva6_sb.sv index a311183c52..ddc7eeeca3 100644 --- a/verif/env/uvme/uvme_cva6_sb.sv +++ b/verif/env/uvme/uvme_cva6_sb.sv @@ -46,22 +46,42 @@ class uvme_cva6_sb_c extends uvm_scoreboard; uvma_isacov_instr_c instr_prev; // Store MTVEC value - bit [XLEN-1:0] mtvec_value = 'h0; + bit [XLEN-1:0] mtvec_value = 0; // Store MEPC value bit [XLEN-1:0] mepc_value; // Store trap pc value bit [XLEN-1:0] trap_pc; + bit [XLEN:0] mcycle_update; // Flag to see if mtvec/mepc has been changed - bit mtvec_change = 'h0; - bit mepc_change = 'h0; + bit mtvec_change = 0; + bit mepc_change = 0; static bit has_trap = 0; // Flag for compressed instruction bit trap_is_compressed; + bit signed [XLEN-1:0] cycle; + bit signed [XLEN-1:0] cycleh; + bit signed [XLEN-1:0] write_cycle; + bit signed [XLEN-1:0] read_cycle; + + // can't initialize it by 0 because it's a legal value + bit unsigned [XLEN-1:0] mcycle_read = 'hx; + bit unsigned [XLEN-1:0] mcycle_prev; + + // can't initialize it by 0 because it's a legal value + bit unsigned [XLEN-1:0] mcycleh_read = 'hx; + bit unsigned [XLEN-1:0] mcycleh_prev; + + int mcycle_value; + bit mcycle_change = 0; + bit overflow = 0; + bit write_in_mcycle; + bit write_in_mcycleh; + `uvm_component_utils_begin(uvme_cva6_sb_c) `uvm_field_object(cfg , UVM_DEFAULT) `uvm_field_object(cntxt, UVM_DEFAULT) @@ -110,6 
+130,11 @@ class uvme_cva6_sb_c extends uvm_scoreboard; */ extern virtual function void check_mepc(uvma_isacov_instr_c instr); + /** + * Check mcycle[h] CSRs + */ + extern virtual function bit [XLEN:0] check_mcycle_h(uvma_isacov_instr_c instr, uvma_isacov_instr_c instr_prev, int cycle_count); + /** * Creates sub-scoreboard components. */ @@ -215,12 +240,18 @@ function void uvme_cva6_sb_c::check_pc_trap(uvma_isacov_instr_c instr, if (instr_prev != null) begin if (instr_prev.trap) begin if (mtvec_change) begin - if (instr.rvfi.pc_rdata[31:2] == mtvec_value[31:2]) begin - //we only support MTVEC Direct mode - `uvm_info(get_type_name(), $sformatf("After a trap, PC matches MTVEC value"), UVM_DEBUG) + if(cfg.xlen == 32) begin + if (instr.rvfi.pc_rdata[31:2] == mtvec_value[31:2]) begin + //we only support MTVEC Direct mode + `uvm_info(get_type_name(), $sformatf("After a trap, PC matches MTVEC value"), UVM_DEBUG) + end + else begin + `uvm_fatal(get_type_name(), "ERROR -> Doesn't jump to MTVEC") + end end else begin - `uvm_fatal(get_type_name(), "ERROR -> Doesn't jump to MTVEC") + //TODO add 64 bit configuration support + `uvm_info(get_type_name(), $sformatf("We only support 32 bit configuration"), UVM_DEBUG) end end else begin @@ -237,7 +268,7 @@ function void uvme_cva6_sb_c::check_mepc(uvma_isacov_instr_c instr); if (instr.trap) begin trap_pc = instr.rvfi.pc_rdata[31:0]; - `uvm_info(get_type_name(), $sformatf("Trap PC : 0x%h ", trap_pc), UVM_NONE) + `uvm_info(get_type_name(), $sformatf("Trap PC : 0x%h ", trap_pc), UVM_DEBUG) if (instr.rvfi.insn[1:0] == 2'h3) begin trap_is_compressed = 1'h0; end @@ -272,34 +303,199 @@ function void uvme_cva6_sb_c::check_mepc(uvma_isacov_instr_c instr); `uvm_info(get_type_name(), $sformatf("Trap is compressed ? 
: %h ", trap_is_compressed), UVM_DEBUG) if (trap_is_compressed) begin if (mepc_value != trap_pc + 'h2) begin - `uvm_warning(get_type_name(), $sformatf("BE CAREFUL -> MEPC hasn't the next instruction's PC 2")) + `uvm_info(get_type_name(), $sformatf("BE CAREFUL -> MEPC hasn't the next instruction's PC 2"), UVM_DEBUG) end end else begin if (mepc_value != trap_pc + 'h4) begin - `uvm_warning(get_type_name(), $sformatf("BE CAREFUL -> MEPC hasn't the next instruction's PC 4")) + `uvm_info(get_type_name(), $sformatf("BE CAREFUL -> MEPC hasn't the next instruction's PC 4"), UVM_DEBUG) end end end else begin - `uvm_warning(get_type_name(), $sformatf("BE CAREFUL -> MEPC still has the trap pc, this could create an infinite loop ")) + `uvm_info(get_type_name(), $sformatf("BE CAREFUL -> MEPC still has the trap pc, this could create an infinite loop if the trap has been raised by an exception"), UVM_DEBUG) end end end endfunction : check_mepc +function bit [XLEN:0] uvme_cva6_sb_c::check_mcycle_h(uvma_isacov_instr_c instr, uvma_isacov_instr_c instr_prev, int cycle_count); + + // Check mcycle value after a CSR read + if (instr_prev == null) return; + + write_in_mcycle = (instr_prev.is_csr_write() && instr_prev.csr_val == 12'hb00) ? 1 : 0; + if (cfg.xlen == 32) begin + write_in_mcycleh = (instr_prev.is_csr_write() && instr_prev.csr_val == 12'hb80) ? 
1 : 0; + end + + mcycle_prev = mcycle_read; + mcycle_read = instr.rvfi.name_csrs["mcycle"].rdata; + // Check MCYCLE in a range because of a delay in the UVM RFVI agent while sending an instruction transaction + if (mcycle_read inside {cycle-2 , cycle-1}) begin + `uvm_info(get_type_name(), $sformatf("MCYCLE Match range [0x%h - 0x%h]",cycle-2, cycle-1), UVM_DEBUG) + end + else begin + `uvm_error(get_type_name(), $sformatf("ERROR : MCYCLE value out of range [0x%h - 0x%h]",cycle-2, cycle-1)) + end + + // Ignore check if there's no write into the MCYCLE + if (!write_in_mcycle) begin + // Check if the CSR is really incremented + if (mcycle_read > mcycle_prev) begin + `uvm_info(get_type_name(), $sformatf("MCYCLE is incremented !!"), UVM_DEBUG) + overflow = 0; + end + else if (mcycle_read < mcycle_prev) begin + `uvm_info(get_type_name(), $sformatf("MCYCLE overflow !!"), UVM_DEBUG) + cycleh += 1; + overflow = 1; + end + else begin + `uvm_error(get_type_name(), $sformatf("ERROR : No overflow - MCYCLE isn't incremented, %d - %d", mcycle_read, mcycle_prev)) + overflow = 0; + end + end + + // MCYCLEH is only supported in RV32 + if (cfg.xlen == 32) begin + mcycleh_prev = mcycleh_read; + mcycleh_read = instr.rvfi.name_csrs["mcycleh"].rdata; + // Check MCYCLEH only if there an overflow or a write into the MCYCLEH + if (overflow || write_in_mcycleh) begin + if (mcycleh_read == cycleh) begin + `uvm_info(get_type_name(), $sformatf("MCYCLEH Match value 0x%h", cycleh), UVM_DEBUG) + end + else begin + `uvm_error(get_type_name(), $sformatf("ERROR : MCYCLEH value didn't Match value 0x%h", cycleh)) + end + end + // Ignore check if there's no write into the MCYCLEH or an overflow of MCYCLE is present + if (!write_in_mcycleh && overflow) begin + if (mcycleh_read > mcycleh_prev) begin + `uvm_info(get_type_name(), $sformatf("MCYCLE overflow -> MCYCLEH is incremented !!"), UVM_DEBUG) + end + else if (mcycleh_read < mcycleh_prev) begin + `uvm_info(get_type_name(), $sformatf("MCYCLEH overflow 
!!"), UVM_DEBUG) + end + else begin + `uvm_error(get_type_name(), $sformatf("ERROR : No overflow - MCYCLEH isn't incremented")) + end + end + // Update the value after a write into the MCYCLEH + if ((instr.is_csr_write() && instr.csr_val == 12'hb80)) begin + if (instr.name == uvma_isacov_pkg::CSRRWI) begin + cycleh = instr.rs1; + `uvm_info(get_type_name(), $sformatf("Write into MCYCLEH the value = 0x%h", cycleh), UVM_DEBUG) + end + if (instr.name == uvma_isacov_pkg::CSRRSI) begin + cycleh = instr.rvfi.rd1_wdata | instr.rs1; + `uvm_info(get_type_name(), $sformatf("Write into MCYCLEH the value = 0x%h", cycleh), UVM_DEBUG) + end + if (instr.name == uvma_isacov_pkg::CSRRCI) begin + cycleh = instr.rvfi.rd1_wdata & ~(instr.rs1); + `uvm_info(get_type_name(), $sformatf("Write into MCYCLEH the value = 0x%h", cycleh), UVM_DEBUG) + end + if (instr.name == uvma_isacov_pkg::CSRRW) begin + cycleh = instr.rs1_value; + `uvm_info(get_type_name(), $sformatf("Write into MCYCLEH the value = 0x%h", cycleh), UVM_DEBUG) + end + if (instr.name == uvma_isacov_pkg::CSRRC) begin + cycleh = instr.rvfi.rd1_wdata & ~(instr.rs1_value); + `uvm_info(get_type_name(), $sformatf("Write into MCYCLEH the value = 0x%h", cycleh), UVM_DEBUG) + end + if (instr.name == uvma_isacov_pkg::CSRRS) begin + cycleh = instr.rvfi.rd1_wdata | instr.rs1_value; + `uvm_info(get_type_name(), $sformatf("Write into MCYCLEH the value = 0x%h", cycleh), UVM_DEBUG) + end + end + end + + // Update the counter value after a write into the MCYCLE + if (instr.group == uvma_isacov_pkg::CSR_GROUP && instr.is_csr_write() && instr.csr_val == 12'hb00) begin + if (instr.name == uvma_isacov_pkg::CSRRWI) begin + mcycle_value = instr.rs1; + mcycle_change = 1'h1; + `uvm_info(get_type_name(), $sformatf("Write into MCYCLE the value = 0x%h", mcycle_value), UVM_DEBUG) + end + if (instr.name == uvma_isacov_pkg::CSRRSI) begin + mcycle_value = instr.rvfi.rd1_wdata | instr.rs1; + mcycle_change = 1'h1; + `uvm_info(get_type_name(), $sformatf("Write 
into MCYCLE the value = 0x%h", mcycle_value), UVM_DEBUG) + end + if (instr.name == uvma_isacov_pkg::CSRRCI) begin + mcycle_value = instr.rvfi.rd1_wdata & ~(instr.rs1); + mcycle_change = 1'h1; + `uvm_info(get_type_name(), $sformatf("Write into MCYCLE the value = 0x%h", mcycle_value), UVM_DEBUG) + end + if (instr.name == uvma_isacov_pkg::CSRRW) begin + mcycle_value = instr.rs1_value; + mcycle_change = 1'h1; + `uvm_info(get_type_name(), $sformatf("Write into MCYCLE the value = 0x%h", mcycle_value), UVM_DEBUG) + end + if (instr.name == uvma_isacov_pkg::CSRRC) begin + mcycle_value = instr.rvfi.rd1_wdata & ~(instr.rs1_value); + mcycle_change = 1'h1; + `uvm_info(get_type_name(), $sformatf("Write into MCYCLE the value = 0x%h", mcycle_value), UVM_DEBUG) + end + if (instr.name == uvma_isacov_pkg::CSRRS) begin + mcycle_value = instr.rvfi.rd1_wdata | instr.rs1_value; + mcycle_change = 1'h1; + `uvm_info(get_type_name(), $sformatf("Write into MCYCLE the value = 0x%h", mcycle_value), UVM_DEBUG) + end + end + else begin + mcycle_change = 0; + end + + return {mcycle_change, mcycle_value}; + +endfunction : check_mcycle_h + task uvme_cva6_sb_c::run_phase(uvm_phase phase); super.run_phase(phase); - forever begin - instr_trn_fifo.get(instr_trn); - check_pc_trap(instr_trn.instr, instr_prev); - check_mepc(instr_trn.instr); - // Move instructions down the pipeline - instr_prev = instr_trn.instr; - end + if (cfg.scoreboard_enabled && cfg.disable_all_csr_checks) + `uvm_warning(get_type_name(),"Scoreboard enabled while config disable_all_csr_checks is true. 
Cycle and Trap will not be scoreboarded nor checked"); + + if (cfg.scoreboard_enabled && !cfg.disable_all_csr_checks) + fork + begin + forever begin + if (!cntxt.clknrst_cntxt.vif.reset_n) begin + cycle = 0; + end + else begin + if (mcycle_update[XLEN]) begin + read_cycle = write_cycle; + write_cycle = cycle; + cycle = mcycle_update + 2; + mcycle_update[XLEN] = 0; + end + else begin + read_cycle = write_cycle; + write_cycle = cycle; + cycle = cycle + 1; + end + end + @(posedge cntxt.clknrst_cntxt.vif.clk); + end + end + begin + forever begin + instr_trn_fifo.get(instr_trn); + check_pc_trap(instr_trn.instr, instr_prev); + check_mepc(instr_trn.instr); + if (instr_trn.instr.rvfi.nret_id == 0) begin + mcycle_update = check_mcycle_h(instr_trn.instr, instr_prev, read_cycle); + end + // Move instructions down the pipeline + instr_prev = instr_trn.instr; + end + end + join_none endtask : run_phase diff --git a/verif/env/uvme/uvme_cva6_vsqr.sv b/verif/env/uvme/uvme_cva6_vsqr.sv index aaef996f12..2b4464dc3e 100644 --- a/verif/env/uvme/uvme_cva6_vsqr.sv +++ b/verif/env/uvme/uvme_cva6_vsqr.sv @@ -36,7 +36,6 @@ class uvme_cva6_vsqr_c extends uvm_sequencer#( // Sequencer handles uvma_clknrst_sqr_c clknrst_sequencer; - uvma_cvxif_vsqr_c cvxif_vsequencer; uvma_axi_vsqr_c axi_vsequencer; uvma_interrupt_sqr_c interrupt_sequencer; diff --git a/verif/env/uvme/uvml_mem_vp.sv b/verif/env/uvme/uvml_mem_vp.sv new file mode 100644 index 0000000000..87691b87f6 --- /dev/null +++ b/verif/env/uvme/uvml_mem_vp.sv @@ -0,0 +1,190 @@ +// Copyright 2024 CoreLab Tech +// +// Licensed under the Solderpad Hardware Licence, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at +// +// https://solderpad.org/licenses/ +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. +// +// SPDX-License-Identifier: Apache-2.0 WITH SHL-2.0 + +`ifndef __UVML_MEM_VP_SV__ +`define __UVML_MEM_VP_SV__ + +typedef class uvme_cva6_cntxt_c; + +/** + * Memory model + */ +class uvml_mem_vp_c#(int XLEN=`UVML_MEM_XLEN) extends uvml_mem_c#(XLEN); + + int vp_log; + uvme_cva6_cntxt_c cntxt; + + `uvm_object_param_utils_begin(uvml_mem_vp_c#(XLEN)); + `uvm_field_object(cntxt, UVM_DEFAULT) + `uvm_object_utils_end + + /** + * Default constructor + */ + extern function new(string name="uvml_mem_vp_c"); + + /** + * Write to memory array + */ + extern virtual function void write(bit[XLEN-1:0] addr, reg[7:0] data); + + extern virtual function void post_write(bit[XLEN-1:0] addr, reg[7:0] data); + + /** + * Read from memory array in 32 bit. 
+ */ + extern virtual function reg[31:0] read_word(bit[XLEN-1:0] addr); + + /** + * Start delayed debug thread + */ + extern virtual task debug(bit dbg_req_value, + bit request_mode, + bit rand_pulse_duration, + bit rand_start_delay, + int unsigned dbg_pulse_duration, + int unsigned start_delay); + + /** + * Wait for clocks + */ + extern virtual task wait_n_clocks(int unsigned n); + + /** + * Asserts the actual interrupt wires + */ + extern virtual task set_debug_req(bit debug_req); + +endclass : uvml_mem_vp_c + + +function uvml_mem_vp_c::new(string name="uvml_mem_vp_c"); + + super.new(name); + vp_log = $fopen("vp.log", "w"); + +endfunction : new + +function void uvml_mem_vp_c::write(bit[XLEN-1:0] addr, reg[7:0] data); + + super.write(addr, data); + + post_write(addr, data); + +endfunction : write + +function reg[31:0] uvml_mem_vp_c::read_word(bit[XLEN-1:0] addr); + return {read(addr+'h3), read(addr+'h2), read(addr+'h1), read(addr+'h0)}; +endfunction : read_word + +function void uvml_mem_vp_c::post_write(bit[XLEN-1:0] addr, reg[7:0] data); + reg[31:0] wval; + if ( addr==(CV_VP_DEBUG_CONTROL_BASE+32'h3) ) begin + wval = read_word(CV_VP_DEBUG_CONTROL_BASE); + `uvm_info("UVML_MEM_VP", $sformatf("Call to virtual peripheral 'vp_debug_control', wval=0x%0x", wval), UVM_HIGH) + debug(.dbg_req_value (wval[31]), + .request_mode (wval[30]), + .rand_pulse_duration (wval[29]), + .dbg_pulse_duration (wval[28:16]), + .rand_start_delay (wval[15]), + .start_delay (wval[14:0])); + end + else if ( addr==CV_VP_VIRTUAL_PRINTER_BASE ) begin + wval=0; + wval=data; + `uvm_info("VP_VSEQ", $sformatf("Call to virtual peripheral 'virtual_printer', wval=0x%0x", wval), UVM_HIGH) + // Allow $write as this acts as a UART/serial printer + // $write("%c", wval); + $fwrite(vp_log, $sformatf("%c", wval)); + end + else if ( addr==CV_VP_STATUS_FLAGS_BASE ) begin + wval = read_word(CV_VP_STATUS_FLAGS_BASE); + // TODO if (wval == 'd123456789) begin + // TODO `uvm_info("VP_VSEQ", "virtual peripheral: TEST 
PASSED", UVM_DEBUG) + // TODO cntxt.vp_status_vif.tests_passed = 1; + // TODO cntxt.vp_status_vif.exit_valid = 1; + // TODO cntxt.vp_status_vif.exit_value = 0; + // TODO end + // TODO else if (wval == 'd1) begin + // TODO cntxt.vp_status_vif.tests_failed = 1; + // TODO cntxt.vp_status_vif.exit_valid = 1; + // TODO cntxt.vp_status_vif.exit_value = 1; + // TODO end + end + else if ( addr==(CV_VP_STATUS_FLAGS_BASE+'h4) ) begin + wval = read_word(CV_VP_STATUS_FLAGS_BASE+'h4); + `uvm_info("VP_VSEQ", "virtual peripheral: END OF SIM", UVM_DEBUG) + // TODO cntxt.vp_status_vif.exit_valid = 1; + // TODO cntxt.vp_status_vif.exit_value = wval; + end + +endfunction : post_write + +task uvml_mem_vp_c::debug(bit dbg_req_value, + bit request_mode, + bit rand_pulse_duration, + bit rand_start_delay, + int unsigned dbg_pulse_duration, + int unsigned start_delay); + fork + begin + if(!uvm_config_db#(uvme_cva6_cntxt_c)::get(uvm_root::get(), "uvm_test_top.env", "cntxt", this.cntxt)) begin + `uvm_fatal("UVML_MEM_VP", "cva6 cntxt object handle not found") + end + cntxt.debug_vif.is_active = 1; + if (rand_start_delay) begin + wait_n_clocks($urandom_range(start_delay, 0)); + end + else begin + wait_n_clocks(start_delay); + end + + if (request_mode) begin + set_debug_req(dbg_req_value); + + if (rand_pulse_duration) begin + if (dbg_pulse_duration == 0) + wait_n_clocks($urandom_range(128,1)); + else + wait_n_clocks($urandom_range(dbg_pulse_duration, 1)); + end + else begin + wait_n_clocks(dbg_pulse_duration); + end + set_debug_req(!dbg_req_value); + end + else begin + set_debug_req(dbg_req_value); + end + end + join_none + +endtask : debug + +task uvml_mem_vp_c::wait_n_clocks(int unsigned n); + + repeat (n) @(cntxt.debug_vif.mon_cb); + +endtask : wait_n_clocks + +task uvml_mem_vp_c::set_debug_req(bit debug_req); + + cntxt.debug_vif.drv_cb.debug_drv <= debug_req; + +endtask : set_debug_req + +`endif // __UVML_MEM_VP_SV__ + diff --git a/verif/env/uvme/vseq/uvme_cva6_reset_vseq.sv 
b/verif/env/uvme/vseq/uvme_cva6_reset_vseq.sv index ffa34503ab..4abca6f864 100644 --- a/verif/env/uvme/vseq/uvme_cva6_reset_vseq.sv +++ b/verif/env/uvme/vseq/uvme_cva6_reset_vseq.sv @@ -41,8 +41,8 @@ class uvme_cva6_reset_vseq_c extends uvme_cva6_base_vseq_c; constraint defaults_cons { soft num_clk_before_reset == 50; - soft rst_deassert_period == 7_400; // 7.4 ns - soft post_rst_wait == 7_400; // 7.4 ns + soft rst_deassert_period == 20_000; // 20 ns + soft post_rst_wait == 20_000; // 20 ns } diff --git a/verif/regress/benchmark.sh b/verif/regress/benchmark.sh index 3e5bf57e5d..282bafdec9 100644 --- a/verif/regress/benchmark.sh +++ b/verif/regress/benchmark.sh @@ -13,16 +13,18 @@ if [ -z "$RISCV" ]; then return fi +if [ -z "$DV_SIMULATORS" ]; then + DV_SIMULATORS=veri-testharness,spike +fi + # install the required tools -source ./verif/regress/install-verilator.sh +if [[ "$DV_SIMULATORS" == *"veri-testharness"* ]]; then + source ./verif/regress/install-verilator.sh +fi source ./verif/regress/install-spike.sh source ./verif/sim/setup-env.sh -if [ -z "$DV_SIMULATORS" ]; then - DV_SIMULATORS=veri-testharness,spike -fi - if [ -z "$DV_TARGET" ]; then DV_TARGET=cv64a6_imafdc_sv39 fi @@ -30,7 +32,7 @@ fi cd verif/sim/ BDIR=../tests/riscv-tests/benchmarks/ -CVA6_FLAGS="--target $DV_TARGET --iss=$DV_SIMULATORS --iss_yaml cva6.yaml --linker ../tests/custom/common/test.ld" +CVA6_FLAGS="--target $DV_TARGET --iss=$DV_SIMULATORS --iss_yaml cva6.yaml --linker ../../config/gen_from_riscv_config/linker/link.ld" GCC_COMMON_SRC=( ../tests/custom/common/syscalls.c diff --git a/verif/regress/coremark.sh b/verif/regress/coremark.sh index 3103d00b3e..b21ec7e86b 100644 --- a/verif/regress/coremark.sh +++ b/verif/regress/coremark.sh @@ -18,28 +18,34 @@ if ! [ -n "$RISCV" ]; then return fi +if ! 
[ -n "$DV_SIMULATORS" ]; then + DV_SIMULATORS=vcs-uvm +fi + # install the required tools -source ./verif/regress/install-verilator.sh +if [[ "$DV_SIMULATORS" == *"veri-testharness"* ]]; then + source ./verif/regress/install-verilator.sh +fi source ./verif/regress/install-spike.sh -source verif/regress/install-riscv-compliance.sh -source verif/regress/install-riscv-tests.sh source ./verif/sim/setup-env.sh -if ! [ -n "$DV_SIMULATORS" ]; then - DV_SIMULATORS=veri-testharness +if ! [ -n "$DV_HWCONFIG_OPTS" ]; then + DV_HWCONFIG_OPTS="cv32a65x" fi if ! [ -n "$UVM_VERBOSITY" ]; then export UVM_VERBOSITY=UVM_NONE fi +export DV_OPTS="$DV_OPTS --issrun_opts=+tb_performance_mode+debug_disable=1+UVM_VERBOSITY=$UVM_VERBOSITY" + make clean make -C verif/sim clean_all cd verif/sim/ -src0=../tests/custom/coremark/core_main.c +src0=../tests/custom/coremark/coremark_main.c srcA=( ../tests/custom/coremark/uart.c ../tests/custom/coremark/core_list_join.c @@ -70,7 +76,7 @@ cflags_opt=( cflags=( "${cflags_opt[@]}" "-DCOMPILER_FLAGS='\"${cflags_opt[*]}\"'" - -DITERATIONS=2 + -DITERATIONS=4 -DPERFORMANCE_RUN -DSKIP_TIME_CHECK -I../tests/custom/env @@ -78,17 +84,14 @@ cflags=( -DNOPRINT ) -default_config="cv32a65x" isa="rv32imc_zba_zbb_zbc_zbs" -set -x python3 cva6.py \ --target hwconfig \ - --isa "$isa" \ - --hwconfig_opts="$default_config" \ + --hwconfig_opts="$DV_HWCONFIG_OPTS" \ --iss="$DV_SIMULATORS" \ --iss_yaml=cva6.yaml \ --c_tests "$src0" \ --gcc_opts "${srcA[*]} ${cflags[*]}" \ - --linker ../tests/custom/common/test.ld \ + --iss_timeout=2000 \ $DV_OPTS diff --git a/verif/regress/cv32a6_tests.sh b/verif/regress/cv32a6_tests.sh index 720c2734b6..0af6cc0231 100644 --- a/verif/regress/cv32a6_tests.sh +++ b/verif/regress/cv32a6_tests.sh @@ -14,25 +14,24 @@ if ! [ -n "$RISCV" ]; then return fi +if ! 
[ -n "$DV_SIMULATORS" ]; then + DV_SIMULATORS=vcs-testharness,spike +fi # install the required tools -source ./verif/regress/install-verilator.sh +if [[ "$DV_SIMULATORS" == *"veri-testharness"* ]]; then + source ./verif/regress/install-verilator.sh +fi source ./verif/regress/install-spike.sh # install the required test suites -source ./verif/regress/install-riscv-compliance.sh source ./verif/regress/install-riscv-tests.sh -source ./verif/regress/install-riscv-arch-test.sh # setup sim env source ./verif/sim/setup-env.sh echo "$SPIKE_INSTALL_DIR$" -if ! [ -n "$DV_SIMULATORS" ]; then - DV_SIMULATORS=vcs-testharness,spike -fi - if ! [ -n "$DV_TARGET" ]; then DV_TARGET=cv32a65x fi @@ -60,7 +59,7 @@ for t in ${riscv_tests_list[@]} ; do [[ $? > 0 ]] && ((errors++)) done -python3 cva6.py --target ${DV_TARGET} --iss=$DV_SIMULATORS --iss_yaml=cva6.yaml --c_tests ../tests/custom/hello_world/hello_world.c --linker=../tests/custom/common/test.ld\ +python3 cva6.py --target ${DV_TARGET} --iss=$DV_SIMULATORS --iss_yaml=cva6.yaml --c_tests ../tests/custom/hello_world/hello_world.c --linker=../../config/gen_from_riscv_config/linker/link.ld\ --gcc_opts="-static -mcmodel=medany -fvisibility=hidden -nostdlib -nostartfiles -g ../tests/custom/common/syscalls.c ../tests/custom/common/crt.S -lgcc -I../tests/custom/env -I../tests/custom/common" $DV_OPTS [[ $? > 0 ]] && ((errors++)) diff --git a/verif/regress/cv64a6_imafdc_tests.sh b/verif/regress/cv64a6_imafdc_tests.sh index a24794b086..e5164f40e6 100644 --- a/verif/regress/cv64a6_imafdc_tests.sh +++ b/verif/regress/cv64a6_imafdc_tests.sh @@ -17,25 +17,24 @@ if ! [ -n "$RISCV" ]; then return fi +if ! 
[ -n "$DV_SIMULATORS" ]; then + DV_SIMULATORS=vcs-testharness,spike +fi # install the required tools -source ./verif/regress/install-verilator.sh +if [[ "$DV_SIMULATORS" == *"veri-testharness"* ]]; then + source ./verif/regress/install-verilator.sh +fi source ./verif/regress/install-spike.sh # install the required test suites -source ./verif/regress/install-riscv-compliance.sh source ./verif/regress/install-riscv-tests.sh -source ./verif/regress/install-riscv-arch-test.sh # setup sim env source ./verif/sim/setup-env.sh echo "$SPIKE_INSTALL_DIR$" -if ! [ -n "$DV_SIMULATORS" ]; then - DV_SIMULATORS=vcs-testharness,spike -fi - if ! [ -n "$DV_TARGET" ]; then DV_TARGET=cv64a6_imafdc_sv39_hpdcache fi @@ -64,7 +63,7 @@ for t in ${riscv_tests_list[@]} ; do done python3 cva6.py --target ${DV_TARGET} --iss=$DV_SIMULATORS --iss_yaml=cva6.yaml --c_tests ../tests/custom/hello_world/hello_world.c \ - --gcc_opts="-static -mcmodel=medany -fvisibility=hidden -nostdlib -nostartfiles -g ../tests/custom/common/syscalls.c ../tests/custom/common/crt.S -I../tests/custom/env -I../tests/custom/common -T ../tests/custom/common/test.ld" $DV_OPTS + --gcc_opts="-static -mcmodel=medany -fvisibility=hidden -nostdlib -nostartfiles -g ../tests/custom/common/syscalls.c ../tests/custom/common/crt.S -I../tests/custom/env -I../tests/custom/common -T ../../config/gen_from_riscv_config/linker/link.ld" $DV_OPTS [[ $? > 0 ]] && ((errors++)) make -C ../.. clean diff --git a/verif/regress/cvxif_verif_regression.sh b/verif/regress/cvxif_verif_regression.sh new file mode 100644 index 0000000000..4d7b9e6503 --- /dev/null +++ b/verif/regress/cvxif_verif_regression.sh @@ -0,0 +1,50 @@ +# Copyright 2024 Thales DIS France SAS +# +# Licensed under the Solderpad Hardware Licence, Version 2.0 (the License); +# you may not use this file except in compliance with the License. 
+# SPDX-License-Identifier: Apache-2.0 WITH SHL-2.0 +# You may obtain a copy of the License at https://solderpad.org/licenses/ +# +# Original Author: Guillaume Chauvon + + +# where are the tools +if ! [ -n "$RISCV" ]; then + echo "Error: RISCV variable undefined" + return +fi + +if ! [ -n "$DV_SIMULATORS" ]; then + DV_SIMULATORS=vcs-testharness,spike +fi + +# install the required tools +if [[ "$DV_SIMULATORS" == *"veri-testharness"* ]]; then + source ./verif/regress/install-verilator.sh +fi +source ./verif/regress/install-spike.sh + +# setup sim env +source ./verif/sim/setup-env.sh + +echo "$SPIKE_INSTALL_DIR$" + +if ! [ -n "$UVM_VERBOSITY" ]; then + export UVM_VERBOSITY=UVM_NONE +fi + +export cvxif=1 # For CVXIF in Spike +export DV_OPTS="$DV_OPTS --issrun_opts=+debug_disable=1+UVM_VERBOSITY=$UVM_VERBOSITY" + + +cd verif/sim/ +make -C ../.. clean +make clean_all +python3 cva6.py --testlist=../tests/testlist_cvxif.yaml --test cvxif_add_nop --iss_yaml cva6.yaml --target cv64a6_imafdc_sv39 --iss=$DV_SIMULATORS $DV_OPTS --linker=../../config/gen_from_riscv_config/linker/link.ld +make -C ../.. clean +make clean_all +python3 cva6.py --testlist=../tests/testlist_cvxif.yaml --test cvxif_add_nop --iss_yaml cva6.yaml --target cv32a65x --iss=$DV_SIMULATORS $DV_OPTS --linker=../../config/gen_from_riscv_config/cv32a65x/linker/link.ld +make -C ../.. clean +make clean_all + +cd - diff --git a/verif/regress/debug_test.sh b/verif/regress/debug_test.sh new file mode 100644 index 0000000000..ff1773aa81 --- /dev/null +++ b/verif/regress/debug_test.sh @@ -0,0 +1,75 @@ +# Copyright 2024 CoreLab Tech +# +# Licensed under the Solderpad Hardware Licence, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# SPDX-License-Identifier: Apache-2.0 WITH SHL-2.0 +# You may obtain a copy of the License at https://solderpad.org/licenses/ + +noprint="" +if [ "$1" == "--no-print" ]; then + noprint="-DHAS_PRINTF=0" +fi + +# where are the tools +if ! 
[ -n "$RISCV" ]; then + echo "Error: RISCV variable undefined" + return +fi + +source ./verif/sim/setup-env.sh + +DV_SIMULATORS=vcs-uvm + +if ! [ -n "$UVM_VERBOSITY" ]; then + export UVM_VERBOSITY=UVM_NONE +fi + +export DV_OPTS="$DV_OPTS --issrun_opts=+UVM_VERBOSITY=$UVM_VERBOSITY" +export DV_OPTS="$DV_OPTS --issrun_opts=+mem_vp_enabled=1" + +make clean +make -C verif/sim clean_all + +cd verif/sim/ + +src0=../tests/custom/debug_test/debug_test.c +srcA=( + ../tests/custom/debug_test/debugger_exception.S + ../tests/custom/debug_test/debugger.S + ../tests/custom/debug_test/handlers.S + ../tests/custom/debug_test/single_step.S + ../tests/custom/debug_test/trigger_code.S + ../tests/custom/debug_test/bsp/crt0.S + ../tests/custom/debug_test/bsp/syscalls.c + ../tests/custom/debug_test/bsp/vectors.S +) + +cflags_opt=( + -mabi=ilp32 \ + -march=rv32imac_zicsr_zifencei \ + -Os -g -static -Wall -pedantic \ + -nostartfiles \ + -Wunused-variable \ + -mno-relax +) + +cflags=( + "${cflags_opt[@]}" + "-DCOMPILER_FLAGS='\"${cflags_opt[*]}\"'" + -I../tests/custom/debug_test/ + -I../tests/custom/debug_test/bsp +) + +default_target="cv32a6_imac_sv0" + +link_ld="../tests/custom/debug_test/bsp/link.ld" + +set -x +python3 cva6.py \ + --target "$default_target" \ + --iss="$DV_SIMULATORS" \ + --iss_yaml=cva6.yaml \ + --c_tests "$src0" \ + --gcc_opts "${srcA[*]} ${cflags[*]}" \ + --linker "$link_ld" \ + $DV_OPTS -v diff --git a/verif/regress/dhrystone.sh b/verif/regress/dhrystone.sh index fe029420cf..36938edf7d 100644 --- a/verif/regress/dhrystone.sh +++ b/verif/regress/dhrystone.sh @@ -13,26 +13,36 @@ if ! [ -n "$RISCV" ]; then return fi +if ! 
[ -n "$DV_SIMULATORS" ]; then + DV_SIMULATORS=vcs-uvm +fi + # install the required tools -source ./verif/regress/install-verilator.sh +if [[ "$DV_SIMULATORS" == *"veri-testharness"* ]]; then + source ./verif/regress/install-verilator.sh +fi source ./verif/regress/install-spike.sh -source verif/regress/install-riscv-compliance.sh -source verif/regress/install-riscv-tests.sh source ./verif/sim/setup-env.sh -if ! [ -n "$DV_SIMULATORS" ]; then - DV_SIMULATORS=veri-testharness +if ! [ -n "$DV_HWCONFIG_OPTS" ]; then + DV_HWCONFIG_OPTS="cv32a65x" +fi + +if ! [ -n "$UVM_VERBOSITY" ]; then + export UVM_VERBOSITY=UVM_NONE fi +export DV_OPTS="$DV_OPTS --issrun_opts=+tb_performance_mode+debug_disable=1+UVM_VERBOSITY=$UVM_VERBOSITY" + make clean make -C verif/sim clean_all cd verif/sim -src0=../tests/riscv-tests/benchmarks/dhrystone/dhrystone_main.c +src0=../tests/custom/dhrystone/dhrystone_main.c srcA=( - ../tests/riscv-tests/benchmarks/dhrystone/dhrystone.c + ../tests/custom/dhrystone/dhrystone.c ../tests/custom/common/syscalls.c ../tests/custom/common/crt.S ) @@ -49,17 +59,15 @@ cflags=( -Wno-implicit-int -I../tests/custom/env -I../tests/custom/common - -I../tests/riscv-tests/benchmarks/dhrystone/ + -I../tests/custom/dhrystone/ -DNOPRINT ) -set -x python3 cva6.py \ --target hwconfig \ - --isa rv64imafdc \ - --hwconfig_opts="cv64a6_imafdc_sv39 +CVA6ConfigNrLoadPipeRegs=0" \ + --hwconfig_opts="$DV_HWCONFIG_OPTS" \ --iss="$DV_SIMULATORS" \ --iss_yaml=cva6.yaml \ --c_tests "$src0" \ --gcc_opts "${srcA[*]} ${cflags[*]}" \ - --linker ../tests/custom/common/test.ld + $DV_OPTS diff --git a/verif/regress/dhrystone_smoke.sh b/verif/regress/dhrystone_smoke.sh new file mode 100644 index 0000000000..fdc7f82f7f --- /dev/null +++ b/verif/regress/dhrystone_smoke.sh @@ -0,0 +1,65 @@ +# Copyright 2022 Thales DIS design services SAS +# +# Licensed under the Solderpad Hardware Licence, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# SPDX-License-Identifier: Apache-2.0 WITH SHL-2.0 +# You may obtain a copy of the License at https://solderpad.org/licenses/ +# +# Original Author: Zbigniew CHAMSKI (zbigniew.chamski@thalesgroup.fr) + +# where are the tools +if ! [ -n "$RISCV" ]; then + echo "Error: RISCV variable undefined" + return +fi + +if ! [ -n "$DV_SIMULATORS" ]; then + DV_SIMULATORS=vcs-uvm +fi + +if ! [ -n "$DV_TARGET" ]; then + DV_TARGET=cv32a65x +fi + +# install the required tools +if [[ "$DV_SIMULATORS" == *"veri-testharness"* ]]; then + source ./verif/regress/install-verilator.sh +fi +source ./verif/regress/install-spike.sh + +source ./verif/sim/setup-env.sh + +make clean +make -C verif/sim clean_all + +cd verif/sim + +src0=../tests/custom/dhrystone/dhrystone_main.c +srcA=( + ../tests/custom/dhrystone/dhrystone.c + ../tests/custom/common/syscalls.c + ../tests/custom/common/crt.S +) +cflags=( + -fno-tree-loop-distribute-patterns + -static + -mcmodel=medany + -fvisibility=hidden + -nostdlib + -nostartfiles + -lgcc + -Os --no-inline + -Wno-implicit-function-declaration + -Wno-implicit-int + -I../tests/custom/env + -I../tests/custom/common + -I../tests/custom/dhrystone/ + -DNOPRINT +) + +python3 cva6.py \ + --target $DV_TARGET \ + --iss="$DV_SIMULATORS" \ + --iss_yaml=cva6.yaml \ + --c_tests "$src0" \ + --gcc_opts "${srcA[*]} ${cflags[*]}" diff --git a/verif/regress/dv-csr-embedded-tests.sh b/verif/regress/dv-csr-embedded-tests.sh index 82752d40d3..0e091230bd 100644 --- a/verif/regress/dv-csr-embedded-tests.sh +++ b/verif/regress/dv-csr-embedded-tests.sh @@ -13,8 +13,14 @@ if ! [ -n "$RISCV" ]; then return fi +if ! [ -n "$DV_SIMULATORS" ]; then + DV_SIMULATORS=vcs-uvm,spike +fi + # install the required tools -source ./verif/regress/install-verilator.sh +if [[ "$DV_SIMULATORS" == *"veri-testharness"* ]]; then + source ./verif/regress/install-verilator.sh +fi source ./verif/regress/install-spike.sh source ./verif/sim/setup-env.sh @@ -26,11 +32,7 @@ if ! 
[ -n "$DV_TARGET" ]; then DV_TARGET=cv32a65x fi -if ! [ -n "$DV_SIMULATORS" ]; then - DV_SIMULATORS=vcs-uvm,spike -fi - cd verif/sim/ -python3 cva6.py --testlist=../tests/testlist_csr_embedded.yaml --iss_yaml cva6.yaml --target $DV_TARGET --iss=$DV_SIMULATORS $DV_OPTS --priv=m -i 1 +python3 cva6.py --testlist=../tests/testlist_csr_embedded.yaml --iss_yaml cva6.yaml --target $DV_TARGET --iss=$DV_SIMULATORS $DV_OPTS --priv=m --iss_timeout 600 --linker=../../config/gen_from_riscv_config/$DV_TARGET/linker/link.ld cd - diff --git a/verif/regress/dv-generated-tests.sh b/verif/regress/dv-generated-tests.sh index fd7946e042..36eff7eeeb 100644 --- a/verif/regress/dv-generated-tests.sh +++ b/verif/regress/dv-generated-tests.sh @@ -18,7 +18,6 @@ if ! [ -n "$RISCV" ]; then fi # install the required tools -source ./verif/regress/install-verilator.sh source ./verif/regress/install-spike.sh source ./verif/sim/setup-env.sh @@ -27,10 +26,6 @@ if ! [ -n "$DV_TARGET" ]; then DV_TARGET=cv32a65x fi -if ! [ -n "$DV_SIMULATORS" ]; then - DV_SIMULATORS=vcs-uvm,spike -fi - if ! [ -n "$list_num" ]; then list_num=1 #default test list fi @@ -137,6 +132,6 @@ done j=0 elif [[ "$list_num" = 0 ]];then printf "==== Execute Directed tests to improve functional coverage of isa, by hitting corners !!! ====\n\n" - python3 cva6.py --testlist=$DIRECTED_TESTLIST --iss_yaml cva6.yaml --isa_extension="zcb" --target $DV_TARGET --iss=vcs-uvm,spike --priv=m + python3 cva6.py --testlist=$DIRECTED_TESTLIST --iss_yaml cva6.yaml --isa_extension="zcb" --target $DV_TARGET --iss=vcs-uvm,spike --priv=m --linker=../../config/gen_from_riscv_config/$DV_TARGET/linker/link.ld fi cd - diff --git a/verif/regress/dv-generated-xif-tests.sh b/verif/regress/dv-generated-xif-tests.sh index cdff3e5d94..0476e5b56d 100644 --- a/verif/regress/dv-generated-xif-tests.sh +++ b/verif/regress/dv-generated-xif-tests.sh @@ -18,7 +18,6 @@ if ! 
[ -n "$RISCV" ]; then fi # install the required tools -source ./verif/regress/install-verilator.sh source ./verif/regress/install-spike.sh source ./verif/sim/setup-env.sh @@ -27,10 +26,6 @@ if ! [ -n "$DV_TARGET" ]; then DV_TARGET=cv32a65x fi -if ! [ -n "$DV_SIMULATORS" ]; then - DV_SIMULATORS=vcs-uvm,spike -fi - if ! [ -n "$list_num" ]; then list_num=1 #default test list fi @@ -103,6 +98,6 @@ done j=0 elif [[ "$list_num" = 0 ]];then printf "==== Execute Directed tests to improve functional coverage of isa, by hitting corners !!! ====\n\n" - python3 cva6.py --testlist=$DIRECTED_TESTLIST --iss_yaml cva6.yaml --target $DV_TARGET --iss=vcs-uvm,spike --priv=m + python3 cva6.py --testlist=$DIRECTED_TESTLIST --iss_yaml cva6.yaml --target $DV_TARGET --iss=vcs-uvm,spike --priv=m --linker=../../config/gen_from_riscv_config/$DV_TARGET/linker/link.ld fi cd - diff --git a/verif/regress/dv-riscv-arch-test.sh b/verif/regress/dv-riscv-arch-test.sh index 4d21f36761..827468188f 100644 --- a/verif/regress/dv-riscv-arch-test.sh +++ b/verif/regress/dv-riscv-arch-test.sh @@ -35,5 +35,11 @@ else TESTLIST=../tests/testlist_riscv-arch-test-$DV_TARGET.yaml fi +if ! [ -n "$UVM_VERBOSITY" ]; then + export UVM_VERBOSITY=UVM_NONE +fi + +export DV_OPTS="$DV_OPTS --issrun_opts=+tb_performance_mode+debug_disable=1+UVM_VERBOSITY=$UVM_VERBOSITY" + cd verif/sim python3 cva6.py --testlist=$TESTLIST --target $DV_TARGET --iss_yaml=cva6.yaml --iss=$DV_SIMULATORS $DV_OPTS --linker=../tests/riscv-arch-test/riscv-target/spike/link.ld diff --git a/verif/regress/dv-riscv-compliance.sh b/verif/regress/dv-riscv-compliance.sh index 71d54d3acc..e4eee49e5d 100755 --- a/verif/regress/dv-riscv-compliance.sh +++ b/verif/regress/dv-riscv-compliance.sh @@ -28,6 +28,12 @@ if ! [ -n "$DV_SIMULATORS" ]; then DV_SIMULATORS=veri-testharness,spike fi +if ! 
[ -n "$UVM_VERBOSITY" ]; then + export UVM_VERBOSITY=UVM_NONE +fi + +export DV_OPTS="$DV_OPTS --issrun_opts=+tb_performance_mode+debug_disable=1+UVM_VERBOSITY=$UVM_VERBOSITY" + cd verif/sim python3 cva6.py --testlist=../tests/testlist_riscv-compliance-$DV_TARGET.yaml --target $DV_TARGET --iss_yaml=cva6.yaml --iss=$DV_SIMULATORS $DV_OPTS cd - diff --git a/verif/regress/dv-riscv-tests.sh b/verif/regress/dv-riscv-tests.sh index 30c845a67c..913244c9ca 100755 --- a/verif/regress/dv-riscv-tests.sh +++ b/verif/regress/dv-riscv-tests.sh @@ -13,8 +13,14 @@ if ! [ -n "$RISCV" ]; then return fi +if ! [ -n "$DV_SIMULATORS" ]; then + DV_SIMULATORS=veri-testharness,spike +fi + # install the required tools -source ./verif/regress/install-verilator.sh +if [[ "$DV_SIMULATORS" == *"veri-testharness"* ]]; then + source ./verif/regress/install-verilator.sh +fi source ./verif/regress/install-spike.sh source verif/regress/install-riscv-tests.sh @@ -24,15 +30,17 @@ if ! [ -n "$DV_TARGET" ]; then DV_TARGET=cv64a6_imafdc_sv39 fi -if ! [ -n "$DV_SIMULATORS" ]; then - DV_SIMULATORS=veri-testharness,spike -fi - if ! [ -n "$DV_TESTLISTS" ]; then DV_TESTLISTS="../tests/testlist_riscv-tests-$DV_TARGET-p.yaml \ ../tests/testlist_riscv-tests-$DV_TARGET-v.yaml" fi +if ! [ -n "$UVM_VERBOSITY" ]; then + export UVM_VERBOSITY=UVM_NONE +fi + +export DV_OPTS="$DV_OPTS --issrun_opts=+tb_performance_mode+debug_disable=1+UVM_VERBOSITY=$UVM_VERBOSITY" + cd verif/sim for TESTLIST in $DV_TESTLISTS do diff --git a/verif/regress/hwconfig_tests.sh b/verif/regress/hwconfig_tests.sh index e634501dfd..8fbf0cd8ae 100644 --- a/verif/regress/hwconfig_tests.sh +++ b/verif/regress/hwconfig_tests.sh @@ -13,20 +13,51 @@ if ! [ -n "$RISCV" ]; then return fi +if ! 
[ -n "$DV_SIMULATORS" ]; then + DV_SIMULATORS=vcs-uvm +fi + # install the required tools -source ./verif/regress/install-verilator.sh +if [[ "$DV_SIMULATORS" == *"veri-testharness"* ]]; then + source ./verif/regress/install-verilator.sh +fi source ./verif/regress/install-spike.sh -source verif/regress/install-riscv-tests.sh source ./verif/sim/setup-env.sh -if ! [ -n "$DV_SIMULATORS" ]; then - DV_SIMULATORS=veri-testharness,spike +if ! [ -n "$DV_TARGET" ]; then + DV_TARGET=cv32a65x fi -cd verif/sim/ -python3 cva6.py --testlist=../tests/testlist_hwconfig.yaml --iss_yaml cva6.yaml --target hwconfig --isa=rv32imac --hwconfig_opts="$DV_HWCONFIG_OPTS" --iss=$DV_SIMULATORS -make -C ../.. clean -make clean_all +make clean +make -C verif/sim clean_all + +cd verif/sim + +srcA=( + ../tests/custom/common/syscalls.c + ../tests/custom/common/crt.S +) +cflags=( + -static + -mcmodel=medany + -fvisibility=hidden + -nostartfiles + -Oz -fno-inline + -Wno-implicit-function-declaration + -Wno-implicit-int + -I../tests/custom/env + -I../tests/custom/common + -I../tests/custom/dhrystone/ + -DNOPRINT +) + +python3 cva6.py \ + --target "$DV_TARGET" \ + --hwconfig_opts="$DV_HWCONFIG_OPTS" \ + --iss="$DV_SIMULATORS" \ + --iss_yaml=cva6.yaml \ + --c_tests "../tests/custom/return0/return0.c" \ + --gcc_opts "${srcA[*]} ${cflags[*]}" cd - diff --git a/verif/regress/install-spike.sh b/verif/regress/install-spike.sh index 538e86a5a9..e240f64bdb 100755 --- a/verif/regress/install-spike.sh +++ b/verif/regress/install-spike.sh @@ -45,12 +45,18 @@ if ! [ -f "$SPIKE_INSTALL_DIR/bin/spike" ]; then if [[ ! -z "$BOOST_INSTALL_DIR" ]]; then WITH_BOOST="--with-boost=${BOOST_INSTALL_DIR}" fi - if [[ ! -f config.log ]]; then + if [ ! -f config.log ]; then ../configure --prefix="$SPIKE_INSTALL_DIR" ${WITH_BOOST} + else + echo "Spike build dir contains 'config.log', skipping 'configure' step ..." fi + # Build both shared and static versions of the yaml-cpp library in sequence + # prior to building Spike. 
+ make -j${NUM_JOBS} yaml-cpp-static + make -j${NUM_JOBS} yaml-cpp make -j${NUM_JOBS} echo "Installing Spike in '$SPIKE_INSTALL_DIR'..." - make install + make -j${NUM_JOBS} install cd $CALLER_DIR else echo "Spike already installed in '$SPIKE_INSTALL_DIR'." diff --git a/verif/regress/riscv-compliance.patch b/verif/regress/riscv-compliance.patch index 240cd52a0d..45eab76981 100644 --- a/verif/regress/riscv-compliance.patch +++ b/verif/regress/riscv-compliance.patch @@ -15,8 +15,21 @@ index 318b7498..4d6dd039 100644 .align 2; \ 1: +diff --git a/riscv-test-env/p/link.ld b/riscv-test-env/p/link.ld +index 392e74f9..cc4a58e1 100644 +--- a/riscv-test-env/p/link.ld ++++ b/riscv-test-env/p/link.ld +@@ -12,6 +12,8 @@ SECTIONS + . = ALIGN(0x1000); + .tohost : { *(.tohost) } + . = ALIGN(0x1000); ++ .uvmif : { *(.uvmif) } ++ . = ALIGN(0x1000); + .text : { *(.text) } + . = ALIGN(0x1000); + .data : { *(.data) } diff --git a/riscv-test-env/p/riscv_test.h b/riscv-test-env/p/riscv_test.h -index eaa67585..966e2ac0 100644 +index eaa67585..91abcde0 100644 --- a/riscv-test-env/p/riscv_test.h +++ b/riscv-test-env/p/riscv_test.h @@ -63,10 +63,10 @@ @@ -74,6 +87,42 @@ index eaa67585..966e2ac0 100644 //----------------------------------------------------------------------- // Data Section Macro +@@ -238,6 +238,9 @@ end_testcode: \ + .align 8; .global tohost; tohost: .dword 0; \ + .align 8; .global fromhost; fromhost: .dword 0; \ + .popsection; \ ++ .pushsection .uvmif,"aw",@progbits; \ ++ .align 8; .global int_ack; int_ack: .dword 0; \ ++ .popsection; \ + .align 4; .global begin_signature; begin_signature: + + #define RVTEST_DATA_END \ +diff --git a/riscv-test-env/pm/link.ld b/riscv-test-env/pm/link.ld +index b3e315e7..5baa819e 100644 +--- a/riscv-test-env/pm/link.ld ++++ b/riscv-test-env/pm/link.ld +@@ -8,6 +8,8 @@ SECTIONS + . = ALIGN(0x1000); + .tohost : { *(.tohost) } + . = ALIGN(0x1000); ++ .uvmif : { *(.uvmif) } ++ . = ALIGN(0x1000); + .text : { *(.text) } + . 
= ALIGN(0x1000); + .data : { *(.data) } +diff --git a/riscv-test-env/pt/link.ld b/riscv-test-env/pt/link.ld +index b3e315e7..5baa819e 100644 +--- a/riscv-test-env/pt/link.ld ++++ b/riscv-test-env/pt/link.ld +@@ -8,6 +8,8 @@ SECTIONS + . = ALIGN(0x1000); + .tohost : { *(.tohost) } + . = ALIGN(0x1000); ++ .uvmif : { *(.uvmif) } ++ . = ALIGN(0x1000); + .text : { *(.text) } + . = ALIGN(0x1000); + .data : { *(.data) } diff --git a/riscv-test-env/v/entry.S b/riscv-test-env/v/entry.S index 97196620..37a68ba1 100644 --- a/riscv-test-env/v/entry.S @@ -87,6 +136,19 @@ index 97196620..37a68ba1 100644 STORE t0,34*REGBYTES(sp) csrr t0,scause STORE t0,35*REGBYTES(sp) +diff --git a/riscv-test-env/v/link.ld b/riscv-test-env/v/link.ld +index b3e315e7..5baa819e 100644 +--- a/riscv-test-env/v/link.ld ++++ b/riscv-test-env/v/link.ld +@@ -8,6 +8,8 @@ SECTIONS + . = ALIGN(0x1000); + .tohost : { *(.tohost) } + . = ALIGN(0x1000); ++ .uvmif : { *(.uvmif) } ++ . = ALIGN(0x1000); + .text : { *(.text) } + . = ALIGN(0x1000); + .data : { *(.data) } diff --git a/riscv-test-env/v/vm.c b/riscv-test-env/v/vm.c index 6ab7fd15..1b365a9f 100644 --- a/riscv-test-env/v/vm.c diff --git a/verif/regress/riscv-tests-env.patch b/verif/regress/riscv-tests-env.patch index 7af4257b87..cbe3f38bb4 100644 --- a/verif/regress/riscv-tests-env.patch +++ b/verif/regress/riscv-tests-env.patch @@ -1,5 +1,18 @@ +diff --git a/p/link.ld b/p/link.ld +index b3e315e..5baa819 100644 +--- a/p/link.ld ++++ b/p/link.ld +@@ -8,6 +8,8 @@ SECTIONS + . = ALIGN(0x1000); + .tohost : { *(.tohost) } + . = ALIGN(0x1000); ++ .uvmif : { *(.uvmif) } ++ . = ALIGN(0x1000); + .text : { *(.text) } + . 
= ALIGN(0x1000); + .data : { *(.data) } diff --git a/p/riscv_test.h b/p/riscv_test.h -index 88ca6c1..b7eb1c2 100644 +index 88ca6c1..def42af 100644 --- a/p/riscv_test.h +++ b/p/riscv_test.h @@ -110,7 +110,7 @@ @@ -35,6 +48,16 @@ index 88ca6c1..b7eb1c2 100644 //----------------------------------------------------------------------- // Data Section Macro +@@ -262,6 +260,9 @@ reset_vector: \ + .align 6; .global tohost; tohost: .dword 0; \ + .align 6; .global fromhost; fromhost: .dword 0; \ + .popsection; \ ++ .pushsection .uvmif,"aw",@progbits; \ ++ .align 6; .global int_ack; int_ack: .dword 0; \ ++ .popsection; \ + .align 4; .global begin_signature; begin_signature: + + #define RVTEST_DATA_END .align 4; .global end_signature; end_signature: diff --git a/v/entry.S b/v/entry.S index fa492e6..49b2d3e 100644 --- a/v/entry.S diff --git a/verif/regress/smoke-gen_tests.sh b/verif/regress/smoke-gen_tests.sh index ef4ad6bf7d..1895dd7fb9 100644 --- a/verif/regress/smoke-gen_tests.sh +++ b/verif/regress/smoke-gen_tests.sh @@ -17,8 +17,14 @@ if ! [ -n "$RISCV" ]; then return fi +if ! [ -n "$DV_SIMULATORS" ]; then + DV_SIMULATORS=vcs-uvm +fi + # install the required tools -source ./verif/regress/install-verilator.sh +if [[ "$DV_SIMULATORS" == *"veri-testharness"* ]]; then + source ./verif/regress/install-verilator.sh +fi source ./verif/regress/install-spike.sh source verif/sim/setup-env.sh @@ -27,10 +33,6 @@ if ! [ -n "$DV_TARGET" ]; then DV_TARGET=cv32a65x fi -if ! 
[ -n "$DV_SIMULATORS" ]; then - DV_SIMULATORS=vcs-uvm,spike -fi - cd verif/sim/ cp ../env/corev-dv/custom/riscv_custom_instr_enum.sv ./dv/src/isa/custom/ python3 cva6.py --testlist=cva6_base_testlist.yaml --test riscv_arithmetic_basic_test_comp --iss_yaml cva6.yaml --target $DV_TARGET -cs ../env/corev-dv/target/rv32imcb/ --mabi ilp32 --isa rv32imc --isa_extension="zba,zbb,zbc,zbs,zcb" --simulator_yaml ../env/corev-dv/simulator.yaml --iss=$DV_SIMULATORS $DV_OPTS --priv=m -i 1 --iss_timeout 300 diff --git a/verif/regress/smoke-tests-cv32a65x.sh b/verif/regress/smoke-tests-cv32a65x.sh new file mode 100644 index 0000000000..cc1d8a37b7 --- /dev/null +++ b/verif/regress/smoke-tests-cv32a65x.sh @@ -0,0 +1,48 @@ +# Copyright 2021 Thales DIS design services SAS +# +# Licensed under the Solderpad Hardware Licence, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# SPDX-License-Identifier: Apache-2.0 WITH SHL-2.0 +# You may obtain a copy of the License at https://solderpad.org/licenses/ +# +# Original Author: Jean-Roch COULON - Thales + +# where are the tools +if ! [ -n "$RISCV" ]; then + echo "Error: RISCV variable undefined" + return +fi + +if ! [ -n "$DV_SIMULATORS" ]; then + DV_SIMULATORS=vcs-testharness,spike +fi + +# install the required tools +if [[ "$DV_SIMULATORS" == *"veri-testharness"* ]]; then + source ./verif/regress/install-verilator.sh +fi +source ./verif/regress/install-spike.sh + +# setup sim env +source ./verif/sim/setup-env.sh + +echo "$SPIKE_INSTALL_DIR$" + +if ! [ -n "$UVM_VERBOSITY" ]; then + export UVM_VERBOSITY=UVM_NONE +fi + +export DV_OPTS="$DV_OPTS --issrun_opts=+debug_disable=1+UVM_VERBOSITY=$UVM_VERBOSITY" + +CC_OPTS="-static -mcmodel=medany -fvisibility=hidden -nostdlib -nostartfiles -g ../tests/custom/common/syscalls.c ../tests/custom/common/crt.S -I../tests/custom/env -I../tests/custom/common -lgcc" + + +cd verif/sim/ + +make -C ../.. 
clean +make clean_all +python3 cva6.py --c_tests ../tests/custom/hello_world/hello_world.c --iss_yaml cva6.yaml --target cv32a65x --iss=$DV_SIMULATORS --linker=../../config/gen_from_riscv_config/cv32a65x/linker/link.ld --gcc_opts="$CC_OPTS" $DV_OPTS +make -C ../.. clean +make clean_all + +cd - diff --git a/verif/regress/smoke-tests-cv32a6_imac_sv32.sh b/verif/regress/smoke-tests-cv32a6_imac_sv32.sh new file mode 100644 index 0000000000..6b6d4217d2 --- /dev/null +++ b/verif/regress/smoke-tests-cv32a6_imac_sv32.sh @@ -0,0 +1,56 @@ +# Copyright 2021 Thales DIS design services SAS +# +# Licensed under the Solderpad Hardware Licence, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# SPDX-License-Identifier: Apache-2.0 WITH SHL-2.0 +# You may obtain a copy of the License at https://solderpad.org/licenses/ +# +# Original Author: Jean-Roch COULON - Thales + +# where are the tools +if ! [ -n "$RISCV" ]; then + echo "Error: RISCV variable undefined" + return +fi + +if ! [ -n "$DV_SIMULATORS" ]; then + DV_SIMULATORS=vcs-testharness,spike +fi + +# install the required tools +if [[ "$DV_SIMULATORS" == *"veri-testharness"* ]]; then + source ./verif/regress/install-verilator.sh +fi +source ./verif/regress/install-spike.sh + +# install the required test suites +source ./verif/regress/install-riscv-compliance.sh +source ./verif/regress/install-riscv-tests.sh +source ./verif/regress/install-riscv-arch-test.sh + +# setup sim env +source ./verif/sim/setup-env.sh + +echo "$SPIKE_INSTALL_DIR$" + +if ! [ -n "$UVM_VERBOSITY" ]; then + export UVM_VERBOSITY=UVM_NONE +fi + +export DV_OPTS="$DV_OPTS --issrun_opts=+debug_disable=1+UVM_VERBOSITY=$UVM_VERBOSITY" + +CC_OPTS="-static -mcmodel=medany -fvisibility=hidden -nostdlib -nostartfiles -g ../tests/custom/common/syscalls.c ../tests/custom/common/crt.S -I../tests/custom/env -I../tests/custom/common -lgcc" + + +cd verif/sim/ + +make -C ../.. 
clean +make clean_all +python3 cva6.py --testlist=../tests/testlist_riscv-compliance-cv32a60x.yaml --test rv32i-I-ADD-01 --iss_yaml cva6.yaml --target cv32a6_imac_sv32 --iss=$DV_SIMULATORS $DV_OPTS +python3 cva6.py --testlist=../tests/testlist_riscv-tests-cv32a60x-p.yaml --test rv32ui-p-add --iss_yaml cva6.yaml --target cv32a6_imac_sv32 --iss=$DV_SIMULATORS $DV_OPTS +python3 cva6.py --testlist=../tests/testlist_riscv-arch-test-cv32a60x.yaml --test rv32im-cadd-01 --iss_yaml cva6.yaml --target cv32a6_imac_sv32 --iss=$DV_SIMULATORS $DV_OPTS --linker=../../config/gen_from_riscv_config/linker/link.ld +python3 cva6.py --c_tests ../tests/custom/hello_world/hello_world.c --iss_yaml cva6.yaml --target cv32a6_imac_sv32 --iss=$DV_SIMULATORS --linker=../../config/gen_from_riscv_config/linker/link.ld --gcc_opts="$CC_OPTS -nostdlib -lgcc" $DV_OPTS +make -C ../.. clean +make clean_all + +cd - diff --git a/verif/regress/smoke-tests-cv64a6_imafdc_sv39.sh b/verif/regress/smoke-tests-cv64a6_imafdc_sv39.sh new file mode 100644 index 0000000000..d60ce010d2 --- /dev/null +++ b/verif/regress/smoke-tests-cv64a6_imafdc_sv39.sh @@ -0,0 +1,58 @@ +# Copyright 2021 Thales DIS design services SAS +# +# Licensed under the Solderpad Hardware Licence, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# SPDX-License-Identifier: Apache-2.0 WITH SHL-2.0 +# You may obtain a copy of the License at https://solderpad.org/licenses/ +# +# Original Author: Jean-Roch COULON - Thales + +# where are the tools +if ! [ -n "$RISCV" ]; then + echo "Error: RISCV variable undefined" + return +fi + +if ! 
[ -n "$DV_SIMULATORS" ]; then + DV_SIMULATORS=vcs-testharness,spike +fi + +# install the required tools +if [[ "$DV_SIMULATORS" == *"veri-testharness"* ]]; then + source ./verif/regress/install-verilator.sh +fi +source ./verif/regress/install-spike.sh + +# install the required test suites +source ./verif/regress/install-riscv-compliance.sh +source ./verif/regress/install-riscv-tests.sh +source ./verif/regress/install-riscv-arch-test.sh + +# setup sim env +source ./verif/sim/setup-env.sh + +echo "$SPIKE_INSTALL_DIR$" + +if ! [ -n "$UVM_VERBOSITY" ]; then + export UVM_VERBOSITY=UVM_NONE +fi + +export DV_OPTS="$DV_OPTS --issrun_opts=+debug_disable=1+UVM_VERBOSITY=$UVM_VERBOSITY" + +CC_OPTS="-static -mcmodel=medany -fvisibility=hidden -nostdlib -nostartfiles -g ../tests/custom/common/syscalls.c ../tests/custom/common/crt.S -I../tests/custom/env -I../tests/custom/common -lgcc" + + +cd verif/sim/ + +make -C ../.. clean +make clean_all +python3 cva6.py --testlist=../tests/testlist_riscv-compliance-cv64a6_imafdc_sv39.yaml --test rv32i-I-ADD-01 --iss_yaml cva6.yaml --target cv64a6_imafdc_sv39 --iss=$DV_SIMULATORS $DV_OPTS +python3 cva6.py --testlist=../tests/testlist_riscv-tests-cv64a6_imafdc_sv39-v.yaml --test rv64ui-v-add --iss_yaml cva6.yaml --target cv64a6_imafdc_sv39 --iss=$DV_SIMULATORS $DV_OPTS +python3 cva6.py --testlist=../tests/testlist_riscv-tests-cv64a6_imafdc_sv39-p.yaml --test rv64ui-p-add --iss_yaml cva6.yaml --target cv64a6_imafdc_sv39 --iss=$DV_SIMULATORS $DV_OPTS +python3 cva6.py --testlist=../tests/testlist_riscv-arch-test-cv64a6_imafdc_sv39.yaml --test rv64i_m-add-01 --iss_yaml cva6.yaml --target cv64a6_imafdc_sv39 --iss=$DV_SIMULATORS $DV_OPTS --linker=../../config/gen_from_riscv_config/linker/link.ld +python3 cva6.py --testlist=../tests/testlist_custom.yaml --test custom_test_template --iss_yaml cva6.yaml --target cv64a6_imafdc_sv39 --iss=$DV_SIMULATORS $DV_OPTS +python3 cva6.py --c_tests ../tests/custom/hello_world/hello_world.c --iss_yaml cva6.yaml 
--target cv64a6_imafdc_sv39 --iss=$DV_SIMULATORS --gcc_opts="$CC_OPTS -nostdlib -lgcc" $DV_OPTS --linker=../../config/gen_from_riscv_config/linker/link.ld +make -C ../.. clean +make clean_all + +cd - diff --git a/verif/regress/smoke-tests.sh b/verif/regress/smoke-tests.sh deleted file mode 100644 index 7f3a2ebdf8..0000000000 --- a/verif/regress/smoke-tests.sh +++ /dev/null @@ -1,74 +0,0 @@ -# Copyright 2021 Thales DIS design services SAS -# -# Licensed under the Solderpad Hardware Licence, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# SPDX-License-Identifier: Apache-2.0 WITH SHL-2.0 -# You may obtain a copy of the License at https://solderpad.org/licenses/ -# -# Original Author: Jean-Roch COULON - Thales - -# where are the tools -if ! [ -n "$RISCV" ]; then - echo "Error: RISCV variable undefined" - return -fi - - -# install the required tools -source ./verif/regress/install-verilator.sh -source ./verif/regress/install-spike.sh - -# install the required test suites -source ./verif/regress/install-riscv-compliance.sh -source ./verif/regress/install-riscv-tests.sh -source ./verif/regress/install-riscv-arch-test.sh - -# setup sim env -source ./verif/sim/setup-env.sh - -echo "$SPIKE_INSTALL_DIR$" - -if ! [ -n "$DV_SIMULATORS" ]; then - DV_SIMULATORS=vcs-testharness,spike -fi - -if ! [ -n "$UVM_VERBOSITY" ]; then - export UVM_VERBOSITY=UVM_NONE -fi - -export DV_OPTS="$DV_OPTS --issrun_opts=+debug_disable=1+UVM_VERBOSITY=$UVM_VERBOSITY" - -CC_OPTS="-static -mcmodel=medany -fvisibility=hidden -nostdlib -nostartfiles -g ../tests/custom/common/syscalls.c ../tests/custom/common/crt.S -I../tests/custom/env -I../tests/custom/common -lgcc" - - -cd verif/sim/ -make -C ../.. 
clean -make clean_all -python3 cva6.py --testlist=../tests/testlist_riscv-compliance-cv64a6_imafdc_sv39.yaml --test rv32i-I-ADD-01 --iss_yaml cva6.yaml --target cv64a6_imafdc_sv39 --iss=$DV_SIMULATORS $DV_OPTS -python3 cva6.py --testlist=../tests/testlist_riscv-tests-cv64a6_imafdc_sv39-v.yaml --test rv64ui-v-add --iss_yaml cva6.yaml --target cv64a6_imafdc_sv39 --iss=$DV_SIMULATORS $DV_OPTS -python3 cva6.py --testlist=../tests/testlist_riscv-tests-cv64a6_imafdc_sv39-p.yaml --test rv64ui-p-add --iss_yaml cva6.yaml --target cv64a6_imafdc_sv39 --iss=$DV_SIMULATORS $DV_OPTS -python3 cva6.py --testlist=../tests/testlist_riscv-arch-test-cv64a6_imafdc_sv39.yaml --test rv64i_m-add-01 --iss_yaml cva6.yaml --target cv64a6_imafdc_sv39 --iss=$DV_SIMULATORS $DV_OPTS --linker=../tests/riscv-arch-test/riscv-target/spike/link.ld -python3 cva6.py --testlist=../tests/testlist_custom.yaml --test custom_test_template --iss_yaml cva6.yaml --target cv64a6_imafdc_sv39 --iss=$DV_SIMULATORS $DV_OPTS -python3 cva6.py --c_tests ../tests/custom/hello_world/hello_world.c --iss_yaml cva6.yaml --target cv64a6_imafdc_sv39 --iss=$DV_SIMULATORS --gcc_opts="$CC_OPTS -T ../tests/custom/common/test.ld" $DV_OPTS -make -C ../.. 
clean -make clean_all -python3 cva6.py --testlist=../tests/testlist_riscv-compliance-cv32a60x.yaml --test rv32i-I-ADD-01 --iss_yaml cva6.yaml --target cv32a6_imac_sv32 --iss=$DV_SIMULATORS $DV_OPTS -python3 cva6.py --testlist=../tests/testlist_riscv-tests-cv32a60x-p.yaml --test rv32ui-p-add --iss_yaml cva6.yaml --target cv32a6_imac_sv32 --iss=$DV_SIMULATORS $DV_OPTS -python3 cva6.py --testlist=../tests/testlist_riscv-arch-test-cv32a60x.yaml --test rv32im-cadd-01 --iss_yaml cva6.yaml --target cv32a6_imac_sv32 --iss=$DV_SIMULATORS $DV_OPTS --linker=../tests/riscv-arch-test/riscv-target/spike/link.ld -python3 cva6.py --c_tests ../tests/custom/hello_world/hello_world.c --iss_yaml cva6.yaml --target cv32a6_imac_sv32 --iss=$DV_SIMULATORS --linker=../tests/custom/common/test.ld --gcc_opts="$CC_OPTS" $DV_OPTS -make -C ../.. clean -make clean_all -python3 cva6.py --testlist=../tests/testlist_riscv-compliance-cv32a60x.yaml --test rv32i-I-ADD-01 --iss_yaml cva6.yaml --target cv32a65x --iss=$DV_SIMULATORS $DV_OPTS -python3 cva6.py --testlist=../tests/testlist_riscv-tests-cv32a60x-p.yaml --test rv32ui-p-add --iss_yaml cva6.yaml --target cv32a65x --iss=$DV_SIMULATORS $DV_OPTS -python3 cva6.py --testlist=../tests/testlist_riscv-arch-test-cv32a60x.yaml --test rv32im-cadd-01 --iss_yaml cva6.yaml --target cv32a65x --iss=$DV_SIMULATORS $DV_OPTS --linker=../tests/riscv-arch-test/riscv-target/spike/link.ld -python3 cva6.py --c_tests ../tests/custom/hello_world/hello_world.c --iss_yaml cva6.yaml --target cv32a65x --iss=$DV_SIMULATORS --linker=../tests/custom/common/test.ld --gcc_opts="$CC_OPTS" $DV_OPTS -make -C ../.. 
clean -make clean_all -python3 cva6.py --testlist=../tests/testlist_riscv-compliance-cv64a6_imafdc_sv39.yaml --test rv32i-I-ADD-01 --iss_yaml cva6.yaml --target cv64a6_mmu --iss=$DV_SIMULATORS $DV_OPTS -python3 cva6.py --testlist=../tests/testlist_riscv-tests-cv64a6_imafdc_sv39-p.yaml --test rv64ui-p-add --iss_yaml cva6.yaml --target cv64a6_mmu --iss=$DV_SIMULATORS $DV_OPTS -python3 cva6.py --testlist=../tests/testlist_riscv-arch-test-cv64a6_imafdc_sv39.yaml --test rv64i_m-add-01 --iss_yaml cva6.yaml --target cv64a6_mmu --iss=$DV_SIMULATORS $DV_OPTS --linker=../tests/riscv-arch-test/riscv-target/spike/link.ld -python3 cva6.py --c_tests ../tests/custom/hello_world/hello_world.c --iss_yaml cva6.yaml --target cv64a6_mmu --iss=$DV_SIMULATORS --linker=../tests/custom/common/test.ld --gcc_opts="$CC_OPTS" $DV_OPTS -make -C ../.. clean -make clean_all - -cd - diff --git a/verif/sim/Makefile b/verif/sim/Makefile index 8b3affca4f..c87ef4bcad 100644 --- a/verif/sim/Makefile +++ b/verif/sim/Makefile @@ -63,9 +63,13 @@ endif spike_yaml ?= $(CVA6_REPO_DIR)/config/gen_from_riscv_config/$(target)/spike/spike.yaml # Set up flags for Spike solo and tandem invocations, but only if parameter file exists. 
+spike_params_final = $(spike_params) ifneq ($(wildcard $(spike_yaml)),) - spike_params += --yaml-param $(spike_yaml) - spike-yaml-plusarg = +config_file=$(CVA6_REPO_DIR)/config/gen_from_riscv_config/$(target)/spike/spike.yaml + spike_params_final := $(spike_params_final) --param-file $(spike_yaml) + spike-yaml-plusarg = +config_file=$(spike_yaml) + spike-yaml-makearg = config_file=$(spike_yaml) +else + spike_params_final := $(spike_params_final) --extension=cvxif endif ############################################## @@ -130,17 +134,13 @@ else cov-run-opt = endif -ifdef cvxif - spike_extension = --extension=cvxif -endif - ############################################################################### # Spike specific commands, variables ############################################################################### spike: LD_LIBRARY_PATH="$$(realpath ../../tools/spike/lib):$$LD_LIBRARY_PATH" \ - $(tool_path)/spike $(spike_stepout) $(spike_extension) --log-commits --isa=$(variant) --priv=$(priv) $(spike_params) -l $(elf) - cp $(log).iss $(log) + $(tool_path)/spike $(spike_stepout) $(spike_extension) --log-commits --isa=$(variant) --priv=$(priv) $(spike_params_final) -l $(elf) + grep -v '^\([[]\|/top/\)' $(log).iss > $(log) ############################################################################### # UVM specific commands, variables @@ -162,7 +162,6 @@ export DV_UVMA_CORE_CNTRL_PATH = $(CORE_V_VERIF)/lib/uvm_agents/uvma_core_cntrl export DV_UVMA_RVFI_PATH = $(CORE_V_VERIF)/lib/uvm_agents/uvma_rvfi export DV_UVMA_ISACOV_PATH = $(CORE_V_VERIF)/lib/uvm_agents/uvma_isacov export DV_UVMA_CLKNRST_PATH = $(CORE_V_VERIF)/lib/uvm_agents/uvma_clknrst -export DV_UVMA_CVXIF_PATH = $(CORE_V_VERIF)/lib/uvm_agents/uvma_cvxif export DV_UVMA_AXI_PATH = $(CORE_V_VERIF)/lib/uvm_agents/uvma_axi5 export DV_UVMA_INTERRUPT_PATH = $(DV_UVME_PATH)/uvma_interrupt export DV_UVMA_DEBUG_PATH = $(CORE_V_VERIF)/lib/uvm_agents/uvma_debug @@ -185,24 +184,27 @@ export SPIKE_PATH = 
$(CORE_V_VERIF)/vendor/riscv/riscv-isa-sim/ COMMON_COMP_UVM_FLAGS = \ +incdir+$(CVA6_REPO_DIR)/verif/env/uvme +incdir+$(CVA6_REPO_DIR)/verif/tb/uvmt \ + +core_name=$(target) \ $(if $(spike-tandem), +define+SPIKE_TANDEM=1) COMMON_PLUS_ARGS = \ ++$(elf) \ +elf_file=$(elf) \ + +core_name=$(target) \ $(spike-yaml-plusarg) \ +tohost_addr=$(shell $$RISCV/bin/$(CV_SW_PREFIX)nm -B $(elf) | grep -w tohost | cut -d' ' -f1) \ +signature=$(elf).signature_output +UVM_TESTNAME=uvmt_cva6_firmware_test_c \ - $(spike-yaml-plusarg) \ +report_file=$(log).yaml +core_name=$(target) ifneq ($(UVM_VERBOSITY),) COMMON_PLUS_ARGS += +UVM_VERBOSITY=$(UVM_VERBOSITY) endif +# Libraries that provide symbols for other libs must come first. COMMON_RUN_ARGS = \ $(COMMON_PLUS_ARGS) $(issrun_opts) \ -sv_lib $(SPIKE_INSTALL_DIR)/lib/libcustomext \ + -sv_lib $(SPIKE_INSTALL_DIR)/lib/libyaml-cpp \ -sv_lib $(SPIKE_INSTALL_DIR)/lib/libriscv \ -sv_lib $(SPIKE_INSTALL_DIR)/lib/libfesvr \ -sv_lib $(SPIKE_INSTALL_DIR)/lib/libdisasm @@ -222,8 +224,8 @@ ALL_XRUN_UVM_FLAGS = -elaborate -messages -sv +incdir+$(XCELIUM_HOME)/tools -xmerror CUNOTB -nowarn CUDEFB -nowarn CUSRCH -warn_multiple_driver -relax_svbtis -timescale 1ns/1ps -status -access +rwc -log $(XRUN_WORK_DIR)/tb_compile.log ALL_XRUN_SIMV_UVM_FLAGS = +sv_lib=$(CVA6_REPO_DIR)/tools/spike/lib/libdisasm +signature=I-ADD-01.signature_output - -XRUN_RUN_FLAGS := -R -messages -status -64bit -licqueue -noupdate -log xrun.log -uvmhome CDNS-1.2 +UVM_VERBOSITY=UVM_LOW -svseed 1 + +XRUN_RUN_FLAGS := -R -messages -status -64bit -licqueue -noupdate -log xrun.log -uvmhome CDNS-1.2 +UVM_VERBOSITY=UVM_LOW -svseed 1 XRUN_DISABLED_WARNINGS := BIGWIX \ ZROMCW \ @@ -236,7 +238,7 @@ XRUN_DISABLED_WARNINGS := $(patsubst %, -nowarn %, $(XRUN_DISABLED_WARNINGS)) XRUN_RUN = $(XRUN_RUN_FLAGS) \ $(ALL_XRUN_SIMV_UVM_FLAGS) \ - $(XRUN_DISABLED_WARNINGS) + $(XRUN_DISABLED_WARNINGS) ifneq ($(DEBUG),) # If RTL DEBUG support requested ifneq ($(VERDI),) # If VERDI interactive mode 
requested, use GUI and do not run simulation @@ -271,9 +273,10 @@ vcs_uvm_comp: cd $(VCS_WORK_DIR) && vcs $(COMMON_COMP_UVM_FLAGS) $(ALL_UVM_FLAGS) \ -f $(FLIST_CORE) -f $(FLIST_TB) \ -f $(CVA6_UVMT_DIR)/uvmt_cva6.flist \ - $(cov-comp-opt) +define+UNSUPPORTED_WITH+ $(isscomp_opts)\ + $(cov-comp-opt) $(isscomp_opts)\ -ignore initializer_driver_checks \ - -top uvmt_cva6_tb + -top uvmt_cva6_tb \ + $(if $(gate), -sdf Max:uvmt_cva6_tb.cva6_dut_wrap.cva6_tb_wrapper_i.i_cva6:$(CVA6_REPO_DIR)/pd/synth/cva6_$(TARGET_CFG)_synth.sdf +neg_tchk, +notimingcheck) vcs_uvm_run: vcs_uvm_comp $(if $(TRACE_FAST), unset VERDI_HOME ;) \ @@ -295,7 +298,7 @@ vcs-uvm: vcs_uvm_comp vcs_uvm_run ### XRUN UVM rules -xrun_uvm_comp: +xrun_uvm_comp: @echo "[XRUN] Building Model" mkdir -p $(XRUN_WORK_DIR) cd $(XRUN_WORK_DIR) && \ @@ -306,11 +309,13 @@ xrun_uvm_comp: $(cov-comp-opt) $(isscomp_opts)\ -top uvmt_cva6_tb +# Libraries that provide symbols for other libs must come first. xrun_uvm_run: @echo "[XRUN] Running" cd $(XRUN_WORK_DIR) && \ xrun \ $(XRUN_RUN) \ + +sv_lib=$(SPIKE_INSTALL_DIR)/lib/libyaml-cpp \ +sv_lib=$(SPIKE_INSTALL_DIR)/lib/libriscv \ +sv_lib=$(SPIKE_INSTALL_DIR)/lib/libfesvr \ +sv_lib=$(SPIKE_INSTALL_DIR)/lib/libdisasm \ @@ -320,7 +325,7 @@ xrun_uvm_run: +UVM_TESTNAME=uvmt_cva6_firmware_test_c \ +tohost_addr=$(shell ${RISCV}/bin/${CV_SW_PREFIX}nm -B $(elf) | grep -w tohost | cut -d' ' -f1) \ $(cov-comp-opt) $(issrun_opts) - + xrun-uvm: xrun_uvm_comp xrun_uvm_run $(tool_path)/spike-dasm --isa=$(variant) < ./xrun_results/trace_rvfi_hart_00.dasm > $(log) @@ -428,7 +433,8 @@ xrun-testharness: questa-testharness: mkdir -p $(path_var)/tmp - make -C $(path_var) sim target=$(target) defines=$(subst +define+,,$(isscomp_opts)) batch-mode=1 elf_file=$(elf) \ + make -C $(path_var) sim target=$(target) defines=$(subst +define+,,$(isscomp_opts)+core_name=$(target)) batch-mode=1 elf_file=$(elf) \ + report_file=$(log).yaml $(spike-yaml-makearg) # TODO: Add support for waveform collection. 
$(tool_path)/spike-dasm --isa=$(variant) < $(path_var)/trace_rvfi_hart_00.dasm > $(log) grep $(isspostrun_opts) $(path_var)/trace_rvfi_hart_00.dasm @@ -437,7 +443,7 @@ questa-testharness: # Common targets and rules ############################################################################### -clean_all: vcs_clean_all +clean_all: vcs_clean_all rm -f *.txt rm -f trace*.log rm -f trace*.dasm diff --git a/verif/sim/cva6.py b/verif/sim/cva6.py index bdcd83c183..29e79a0656 100644 --- a/verif/sim/cva6.py +++ b/verif/sim/cva6.py @@ -33,7 +33,7 @@ from dv.scripts.whisper_log_trace_csv import * from dv.scripts.sail_log_to_trace_csv import * from dv.scripts.instr_trace_compare import * - +from pathlib import Path from types import SimpleNamespace LOGGER = logging.getLogger() @@ -158,6 +158,7 @@ def parse_iss_yaml(iss, iss_yaml, isa, target, setting_dir, debug_cmd, priv, spi else: cmd = re.sub(r"\", isa, cmd) cmd = re.sub(r"\", priv, cmd) + cmd = re.sub(r"\", target, cmd) return cmd logging.error("Cannot find ISS %0s" % iss) sys.exit(RET_FAIL) @@ -411,217 +412,108 @@ def gcc_compile(test_list, output_dir, isa, mabi, opts, debug_cmd, linker): elf2bin(elf, binary, debug_cmd) -def run_assembly(asm_test, iss_yaml, isa, target, mabi, gcc_opts, iss_opts, output_dir, - setting_dir, debug_cmd, linker, priv, spike_params, test_name = None, iss_timeout=500): - """Run a directed assembly test with ISS - - Args: - asm_test : Assembly test file - iss_yaml : ISS configuration file in YAML format - isa : ISA variant passed to the ISS - mabi : MABI variant passed to GCC - gcc_opts : User-defined options for GCC compilation - iss_opts : Instruction set simulators - output_dir : Output directory of compiled test files - setting_dir : Generator setting directory - debug_cmd : Produce the debug cmd log without running - linker : Path to the linker - iss_timeout : Timeout for ISS simulation - """ - if not asm_test.endswith(".S"): - logging.error("%s is not an assembly .S file" % asm_test) - 
return - cwd = os.path.dirname(os.path.realpath(__file__)) - asm_test = os.path.expanduser(asm_test) - report = ("%s/iss_regr.log" % output_dir).rstrip() - asm = re.sub(r"^.*\/", "", asm_test) - asm = re.sub(r"\.S$", "", asm) - if os.getenv('cov'): - asm = asm + "-" + str(datetime.datetime.now().isoformat()) - prefix = ("%s/directed_asm_tests/%s" % (output_dir, asm)) - elf = prefix + ".o" - binary = prefix + ".bin" - iss_list = iss_opts.split(",") - run_cmd("mkdir -p %s/directed_asm_tests" % output_dir) - logging.info("Compiling assembly test: %s" % asm_test) - - # gcc compilation - cmd = ("%s %s \ - -I%s/../env/corev-dv/user_extension \ - -T%s %s -o %s " % \ - (get_env_var("RISCV_CC", debug_cmd = debug_cmd), asm_test, cwd, linker, - gcc_opts, elf)) - cmd += (" -march=%s" % isa) - cmd += (" -mabi=%s" % mabi) - run_cmd_output(cmd.split(), debug_cmd = debug_cmd) - elf2bin(elf, binary, debug_cmd) - log_list = [] - # ISS simulation - test_log_name = test_name or asm - for iss in iss_list: - run_cmd("mkdir -p %s/%s_sim" % (output_dir, iss)) - if log_format == 1: - log = ("%s/%s_sim/%s_%d.%s.log" % (output_dir, iss, test_log_name, test_iteration, target)) - else: - log = ("%s/%s_sim/%s.%s.log" % (output_dir, iss, test_log_name, target)) - yaml = ("%s/%s_sim/%s.%s.log.yaml" % (output_dir, iss, test_log_name, target)) - log_list.append(log) - base_cmd = parse_iss_yaml(iss, iss_yaml, isa, target, setting_dir, debug_cmd, priv, spike_params) - cmd = get_iss_cmd(base_cmd, elf, target, log) - logging.info("[%0s] Running ISS simulation: %s" % (iss, cmd)) - if "spike" in iss: ratio = 10 - else: ratio = 1 - run_cmd(cmd, iss_timeout//ratio, debug_cmd = debug_cmd) - logging.info("[%0s] Running ISS simulation: %s ...done" % (iss, elf)) - if (iss != "spike" and os.environ.get('SPIKE_TANDEM') != None): - analize_result_yaml(yaml) - - if len(iss_list) == 2: - compare_iss_log(iss_list, log_list, report) - -def run_assembly_from_dir(asm_test_dir, iss_yaml, isa, mabi, gcc_opts, iss, - 
output_dir, setting_dir, debug_cmd, iss_timeout=500): - """Run a directed assembly test from a directory with spike +def tandem_postprocess(tandem_report, target, isa, test_name, log, testlist, iss, iterations = None): + analyze_tandem_report(tandem_report) + process_verilator_sim_log(log, log + ".csv") + generate_yaml_report(tandem_report, target, isa, test_name, testlist, iss, False , iterations) - Args: - asm_test_dir : Assembly test file directory - iss_yaml : ISS configuration file in YAML format - isa : ISA variant passed to the ISS - mabi : MABI variant passed to GCC - gcc_opts : User-defined options for GCC compilation - iss : Instruction set simulators - output_dir : Output directory of compiled test files - setting_dir : Generator setting directory - debug_cmd : Produce the debug cmd log without running - iss_timeout : Timeout for ISS simulation - """ - result = run_cmd("find %s -name \"*.S\"" % asm_test_dir) - if result: - asm_list = result.splitlines() - logging.info("Found %0d assembly tests under %s" % - (len(asm_list), asm_test_dir)) - for asm_file in asm_list: - run_assembly(asm_file, iss_yaml, isa, target, mabi, gcc_opts, iss, output_dir, - setting_dir, debug_cmd, linker, iss_timeout=iss_timeout) - if "," in iss: - report = ("%s/iss_regr.log" % output_dir).rstrip() - save_regr_report(report) +def analyze_tandem_report(yaml_path): + with open(yaml_path, 'r') as f: + data = yaml.safe_load(f) + try: + mismatches_count = (data["mismatches_count"]) + instr_count = (data["instr_count"]) + exit_code = (data["exit_code"]) + matches_count = instr_count - mismatches_count + logging.info("TANDEM Result : %s (exit code %s) with %s mismatches and %s matches" + % (data["exit_cause"], exit_code, mismatches_count, matches_count)) + except KeyError: + logging.info("Incomplete TANDEM YAML report") + + +def generate_yaml_report(yaml_path, target, isa, test, testlist, iss, initial_creation , iteration = None): + if not initial_creation: + with open(yaml_path, 'r') as 
f: + report = yaml.safe_load(f) else: - logging.error("No assembly test(*.S) found under %s" % asm_test_dir) - -def analize_result_yaml(yaml_path): - - if (os.path.exists(yaml_path)): - with open(yaml_path, 'r') as f: - data = yaml.safe_load(f) - mismatches = data["mismatches"] - mismatches_count = (data["mismatches_count"]) - instr_count = (data["instr_count"]) - matches_count = instr_count - mismatches_count - logging.info("TANDEM Result : %s with %s mismatches and %s matches" - % (data["exit_cause"], mismatches_count, matches_count)) - else: - logging.info("TANDEM YAML not found") + report = {"exit_cause": "UNKNOWN"} + report["target"] = target + report["isa"] = isa + report["test"] = test + report["testlist"] = testlist + report["simulator"] = iss + if iteration != None: + report["iteration"] = iteration + with open(yaml_path, "w") as f: + yaml.dump(report, f) -# python3 run.py --target rv64gc --iss=spike,verilator --elf_tests bbl.o -def run_elf(c_test, iss_yaml, isa, target, mabi, gcc_opts, iss_opts, output_dir, - setting_dir, debug_cmd, priv, spike_params, iss_timeout=50000): - """Run a directed c test with ISS - - Args: - c_test : C test file - iss_yaml : ISS configuration file in YAML format - isa : ISA variant passed to the ISS - mabi : MABI variant passed to GCC - gcc_opts : User-defined options for GCC compilation - iss_opts : Instruction set simulators - output_dir : Output directory of compiled test files - setting_dir : Generator setting directory - debug_cmd : Produce the debug cmd log without running - """ - if not c_test.endswith(".o"): - logging.error("%s is not a .o file" % c_test) - return - cwd = os.path.dirname(os.path.realpath(__file__)) - c_test = os.path.expanduser(c_test) - report = ("%s/iss_regr.log" % output_dir).rstrip() - c = re.sub(r"^.*\/", "", c_test) - c = re.sub(r"\.o$", "", c) - prefix = ("%s/directed_elf_tests/%s" % (output_dir, c)) - elf = prefix + ".o" - binary = prefix + ".bin" - iss_list = iss_opts.split(",") - 
run_cmd("mkdir -p %s/directed_elf_tests" % output_dir, 600, debug_cmd=debug_cmd) - logging.info("Copy elf test: %s" % c_test) - run_cmd("cp %s %s/directed_elf_tests" % (c_test, output_dir)) - elf2bin(elf, binary, debug_cmd) - log_list = [] - # ISS simulation - for iss in iss_list: - run_cmd("mkdir -p %s/%s_sim" % (output_dir, iss)) - log = ("%s/%s_sim/%s.%s.log" % (output_dir, iss, c, target)) - yaml = ("%s/%s_sim/%s.%s.log.yaml" % (output_dir, iss, c, target)) - log_list.append(log) - base_cmd = parse_iss_yaml(iss, iss_yaml, isa, target, setting_dir, debug_cmd, priv, spike_params) - cmd = get_iss_cmd(base_cmd, elf, target, log) - logging.info("[%0s] Running ISS simulation: %s" % (iss, cmd)) - if "veri" in iss: ratio = 35 - elif "spike" in iss: ratio = 0.1 - else: ratio = 1 - run_cmd(cmd, int(iss_timeout*ratio), debug_cmd = debug_cmd) - logging.info("[%0s] Running ISS simulation: %s ...done" % (iss, elf)) - - if len(iss_list) == 2: - compare_iss_log(iss_list, log_list, report) -def run_c(c_test, iss_yaml, isa, target, mabi, gcc_opts, iss_opts, output_dir, - setting_dir, debug_cmd, linker, priv, spike_params, test_name = None, iss_timeout=500): - """Run a directed c test with ISS +def run_test(test, iss_yaml, isa, target, mabi, gcc_opts, iss_opts, output_dir, + setting_dir, debug_cmd, linker, priv, spike_params, test_name=None, iss_timeout=500, testlist="custom"): + """Run a directed test with ISS Args: - c_test : C test file + test : C test file iss_yaml : ISS configuration file in YAML format isa : ISA variant passed to the ISS + target : Target simulator name mabi : MABI variant passed to GCC gcc_opts : User-defined options for GCC compilation - iss_opts : Instruction set simulators + iss_opts : Options for the instruction set simulators output_dir : Output directory of compiled test files setting_dir : Generator setting directory - debug_cmd : Produce the debug cmd log without running + debug_cmd : Produce the debug command log without running linker : Path to 
the linker - iss_timeout : Timeout for ISS simulation + priv : Privilege mode of the test + spike_params: Parameters for the Spike ISS + test_name : (Optional) Name of the test + iss_timeout : Timeout for ISS simulation (default: 500) + testlist : Test list identifier (default: "custom") """ - if not c_test.endswith(".c"): - logging.error("%s is not a .c file" % c_test) - return + if testlist != None: + testlist = testlist.split('/')[-1].strip("testlist_").split('.')[0] + + if test.endswith(".c"): + test_type = "c" + elif test.endswith(".S"): + test_type = "S" + elif test.endswith(".o"): + test_type = "o" + else: + sys.exit("Unknown test extension!") + cwd = os.path.dirname(os.path.realpath(__file__)) - c_test = os.path.expanduser(c_test) + test_path = os.path.expanduser(test) report = ("%s/iss_regr.log" % output_dir).rstrip() - c = re.sub(r"^.*\/", "", c_test) - c = re.sub(r"\.c$", "", c) - prefix = (f"{output_dir}/directed_c_tests/{c}") - elf = prefix + ".o" - binary = prefix + ".bin" + test = re.sub(r"^.*\/", "", test_path) + test = re.sub(rf"\.{test_type}$", "", test) + prefix = (f"{output_dir}/directed_tests/{test}") + if test_type == "o": + elf = test_path + else: + elf = prefix + ".o" + iss_list = iss_opts.split(",") - run_cmd("mkdir -p %s/directed_c_tests" % output_dir) - logging.info("Compiling c test: %s" % c_test) - - # gcc compilation - cmd = ("%s %s \ - -I%s/dv/user_extension \ - -T%s %s -o %s " % \ - (get_env_var("RISCV_CC", debug_cmd = debug_cmd), c_test, cwd, - linker, gcc_opts, elf)) - cmd += (" -march=%s" % isa) - cmd += (" -mabi=%s" % mabi) - run_cmd(cmd, debug_cmd = debug_cmd) - elf2bin(elf, binary, debug_cmd) + run_cmd("mkdir -p %s/directed_tests" % output_dir) + + if test_type != "o": + # gcc compilation + logging.info("Compiling test: %s" % test) + cmd = ("%s %s \ + -I%s/dv/user_extension \ + -T%s %s -o %s " % \ + (get_env_var("RISCV_CC", debug_cmd = debug_cmd), test_path, cwd, + linker, gcc_opts, elf)) + cmd += (" -march=%s" % isa) + cmd += 
(" -mabi=%s" % mabi) + run_cmd(cmd, debug_cmd = debug_cmd) log_list = [] # ISS simulation - test_log_name = test_name or c + test_log_name = test_name or test for iss in iss_list: + tandem_sim = iss != "spike" and os.environ.get('SPIKE_TANDEM') != None run_cmd("mkdir -p %s/%s_sim" % (output_dir, iss)) if log_format == 1: log = ("%s/%s_sim/%s_%d.%s.log" % (output_dir, iss, test_log_name, test_iteration, target)) @@ -630,50 +522,23 @@ def run_c(c_test, iss_yaml, isa, target, mabi, gcc_opts, iss_opts, output_dir, yaml = ("%s/%s_sim/%s.%s.log.yaml" % (output_dir, iss, test_log_name, target)) log_list.append(log) base_cmd = parse_iss_yaml(iss, iss_yaml, isa, target, setting_dir, debug_cmd, priv, spike_params) + print(elf) cmd = get_iss_cmd(base_cmd, elf, target, log) logging.info("[%0s] Running ISS simulation: %s" % (iss, cmd)) if "spike" in iss: ratio = 10 else: ratio = 1 + if tandem_sim: + generate_yaml_report(yaml, target, isa, test_log_name, testlist, iss, True) run_cmd(cmd, iss_timeout//ratio, debug_cmd = debug_cmd) logging.info("[%0s] Running ISS simulation: %s ...done" % (iss, elf)) - if (iss != "spike" and os.environ.get('SPIKE_TANDEM') != None): - analize_result_yaml(yaml) + if tandem_sim: + tandem_postprocess(yaml, target, isa, test_log_name, log, testlist, iss) if len(iss_list) == 2: compare_iss_log(iss_list, log_list, report) -def run_c_from_dir(c_test_dir, iss_yaml, isa, mabi, gcc_opts, iss, - output_dir, setting_dir, debug_cmd, priv, iss_timeout): - """Run a directed c test from a directory with spike - - Args: - c_test_dir : C test file directory - iss_yaml : ISS configuration file in YAML format - isa : ISA variant passed to the ISS - mabi : MABI variant passed to GCC - gcc_opts : User-defined options for GCC compilation - iss : Instruction set simulators - output_dir : Output directory of compiled test files - setting_dir : Generator setting directory - debug_cmd : Produce the debug cmd log without running - """ - result = run_cmd("find %s -name 
\"*.c\"" % c_test_dir) - if result: - c_list = result.splitlines() - logging.info("Found %0d c tests under %s" % - (len(c_list), c_test_dir)) - for c_file in c_list: - run_c(c_file, iss_yaml, isa, target, mabi, gcc_opts, iss, output_dir, - setting_dir, debug_cmd, linker, priv, iss_timeout=iss_timeout) - if "," in iss: - report = ("%s/iss_regr.log" % output_dir).rstrip() - save_regr_report(report) - else: - logging.error("No c test(*.c) found under %s" % c_test_dir) - - def iss_sim(test_list, output_dir, iss_list, iss_yaml, iss_opts, isa, target, setting_dir, timeout_s, debug_cmd, priv, spike_params): """Run ISS simulation with the generated test program @@ -694,6 +559,7 @@ def iss_sim(test_list, output_dir, iss_list, iss_yaml, iss_opts, base_cmd = parse_iss_yaml(iss, iss_yaml, isa, target, setting_dir, debug_cmd, priv, spike_params) logging.info("%s sim log dir: %s" % (iss, log_dir)) run_cmd_output(["mkdir", "-p", log_dir]) + tandem_sim = iss != "spike" and os.environ.get('SPIKE_TANDEM') != None for test in test_list: if 'no_iss' in test and test['no_iss'] == 1: continue @@ -703,15 +569,20 @@ def iss_sim(test_list, output_dir, iss_list, iss_yaml, iss_opts, elf = prefix + ".o" log = ("%s/%s_%d.%s.log" % (log_dir, test['test'], i, target)) cmd = get_iss_cmd(base_cmd, elf, target, log) + yaml = ("%s/%s_%s.%s.log.yaml" % (log_dir, test['test'], i, target)) if 'iss_opts' in test: cmd += ' ' cmd += test['iss_opts'] logging.info("Running %s sim: %s" % (iss, elf)) + if tandem_sim: + generate_yaml_report(yaml, target, isa, test['test'], "generated tests", iss, True, i) if iss == "ovpsim": run_cmd(cmd, timeout_s, check_return_code=False, debug_cmd = debug_cmd) else: run_cmd(cmd, timeout_s, debug_cmd = debug_cmd) logging.debug(cmd) + if tandem_sim: + tandem_postprocess(yaml, target, isa, test['test'], log, "generated tests", iss, i) def iss_cmp(test_list, iss, target, output_dir, stop_on_first_error, exp, debug_cmd): @@ -973,7 +844,11 @@ def load_config(args, cwd): 
args.simulator_yaml = cwd + "/cva6-simulator.yaml" if not args.linker: - args.linker = cwd + "/link.ld" + my_link = Path(cwd + f"/../../config/gen_from_riscv_config/{args.target}/linker/link.ld") + if my_link.is_file(): + args.linker = cwd + f"/../../config/gen_from_riscv_config/{args.target}/linker/link.ld" + else: + args.linker = cwd + f"/../../config/gen_from_riscv_config/linker/link.ld" # Keep the core_setting_dir option to be backward compatible, suggest to use # --custom_target @@ -983,74 +858,81 @@ def load_config(args, cwd): else: args.core_setting_dir = args.custom_target + base = "" if not args.custom_target: if not args.testlist: args.testlist = cwd + "/target/"+ args.target +"/testlist.yaml" - if args.target in ("cv64a6_imafdc_sv39", "cv64a6_imafdc_sv39_hpdcache", "cv64a6_imafdc_sv39_wb"): + if args.target == "hwconfig": + base, changes = user_config.parse_derive_args(args.hwconfig_opts.split()) + input_file = f"../../core/include/{base}_config_pkg.sv" + output_file = "../../core/include/hwconfig_config_pkg.sv" + user_config.derive_config(input_file, output_file, changes) + args.hwconfig_opts = user_config.get_config(output_file) + os.system("mkdir -p ../../config/gen_from_riscv_config/hwconfig/spike") + os.system("mkdir -p ../../config/gen_from_riscv_config/hwconfig/linker") + os.system("cp ../../config/gen_from_riscv_config/%s/spike/spike.yaml ../../config/gen_from_riscv_config/hwconfig/spike/" % (base)) + os.system("cp ../../config/gen_from_riscv_config/%s/linker/*.ld ../../config/gen_from_riscv_config/hwconfig/linker/" % (base)) + else: + base = args.target + if base in ("cv64a6_imafdc_sv39", "cv64a6_imafdc_sv39_hpdcache", "cv64a6_imafdc_sv39_wb"): args.mabi = "lp64d" args.isa = "rv64gc_zba_zbb_zbs_zbc" - elif args.target == "cv32a60x": # step1 configuration + elif base == "cv32a60x": args.mabi = "ilp32" - args.isa = "rv32imac_zba_zbb_zbs_zbc" - elif args.target == "cv32a65x": + args.isa = "rv32imc_zba_zbb_zbs_zbc" + args.priv = "m" + elif base == 
"cv32a65x": args.mabi = "ilp32" args.isa = "rv32imc_zba_zbb_zbs_zbc" args.priv = "m" - elif args.target == "cv64a6_mmu": + elif base == "cv64a6_mmu": args.mabi = "lp64" args.isa = "rv64imac_zba_zbb_zbs_zbc" - elif args.target == "cv32a6_imac_sv0": + elif base == "cv32a6_imac_sv0": args.mabi = "ilp32" args.isa = "rv32imac" - elif args.target == "cv32a6_imac_sv32": + elif base == "cv32a6_imac_sv32": args.mabi = "ilp32" args.isa = "rv32imac" - elif args.target == "cv32a6_imafc_sv32": + elif base == "cv32a6_imafc_sv32": args.mabi = "ilp32f" args.isa = "rv32imafc" - elif args.target == "rv32imc": + elif base == "rv32imc": args.mabi = "ilp32" args.isa = "rv32imc" - elif args.target == "rv32imac": + elif base == "rv32imac": args.mabi = "ilp32" args.isa = "rv32imac" - elif args.target == "rv32ima": + elif base == "rv32ima": args.mabi = "ilp32" args.isa = "rv32ima" - elif args.target == "rv32gc": + elif base == "rv32gc": args.mabi = "ilp32f" args.isa = "rv32gc" - elif args.target == "multi_harts": + elif base == "multi_harts": args.mabi = "ilp32f" args.isa = "rv32gc" - elif args.target == "rv32imcb": + elif base == "rv32imcb": args.mabi = "ilp32" args.isa = "rv32imcb" - elif args.target == "rv32i": + elif base == "rv32i": args.mabi = "ilp32" args.isa = "rv32i" - elif args.target == "rv64imc": + elif base == "rv64imc": args.mabi = "lp64" args.isa = "rv64imc" - elif args.target == "rv64gc": + elif base == "rv64gc": args.mabi = "lp64d" args.isa = "rv64gc" - elif args.target == "rv64imac": + elif base == "rv64imac": args.mabi = "lp64" args.isa = "rv64imac" - elif args.target == "rv64gcv": + elif base == "rv64gcv": args.mabi = "lp64d" args.isa = "rv64gcv" - elif args.target == "ml": + elif base == "ml": args.mabi = "lp64" args.isa = "rv64imc" - elif args.target == "hwconfig": - base, changes = user_config.parse_derive_args(args.hwconfig_opts.split()) - input_file = f"../../core/include/{base}_config_pkg.sv" - output_file = "../../core/include/hwconfig_config_pkg.sv" - 
user_config.derive_config(input_file, output_file, changes) - args.hwconfig_opts = user_config.get_config(output_file) - args.mabi = 'ilp32' if args.hwconfig_opts['XLEN'] == 32 else 'lp64' else: sys.exit("Unsupported pre-defined target: %0s" % args.target) args.core_setting_dir = cwd + "/dv" + "/target/"+ args.isa @@ -1099,15 +981,39 @@ def check_spike_version(): # Get Spike User version get_env_var("SPIKE_PATH") user_spike_version = subprocess.run("$SPIKE_PATH/spike -v", capture_output=True, text=True, shell=True) - user_spike_version_string = user_spike_version.stderr.strip() + user_spike_stdout_string = user_spike_version.stdout.strip() + user_spike_stderr_string = user_spike_version.stderr.strip() if user_spike_version.returncode != 0: + # Re-run 'spike -v' and print contents of stdout and stderr. + logging.info("Spike version check ('$SPIKE_PATH/spike -v')") + logging.info(f"- stdout:\n\n{user_spike_stdout_string}\n") + logging.info(f"- stderr:\n\n{user_spike_stderr_string}") + # Run 'ldd' on Spike binary and print contents of stdout and stderr. + spike_ldd = subprocess.run( + "/bin/ldd $SPIKE_PATH/spike", capture_output=True, text=True, shell=True + ) + spike_ldd_stdout = spike_ldd.stdout.strip() + spike_ldd_stderr = spike_ldd.stderr.strip() + logging.info("Spike LDD check ('ldd $SPIKE_PATH/spike')") + logging.info(f"- stdout:\n\n{spike_ldd_stdout}\n") + logging.info(f"- stderr:\n\n{spike_ldd_stderr}") + # Run 'ls -l' on Spike lib directory and print contents of stdout and stderr. 
+ spike_lib_ls = subprocess.run( + "ls -l $SPIKE_PATH/../lib", capture_output=True, text=True, shell=True + ) + spike_lib_stdout = spike_lib_ls.stdout.strip() + spike_lib_stderr = spike_lib_ls.stderr.strip() + logging.info("Stdout of Spike library check ('ls -l $SPIKE_PATH/../lib')") + logging.info(f"- stdout:\n\n{spike_lib_stdout}\n") + logging.info(f"- stderr:\n\n{spike_lib_stderr}") + incorrect_version_exit("Spike", "- unknown -", spike_version) - logging.info(f"Spike Version: {user_spike_version_string}") + logging.info(f"Spike Version: {user_spike_stderr_string}") - if user_spike_version_string != spike_version: - incorrect_version_exit("Spike", user_spike_version_string, spike_version) + if user_spike_stderr_string != spike_version: + incorrect_version_exit("Spike", user_spike_stderr_string, spike_version) def check_verilator_version(): @@ -1123,7 +1029,7 @@ def check_verilator_version(): def check_tools_version(): check_cc_version() - check_spike_version() + # check_spike_version() check_verilator_version() @@ -1244,57 +1150,25 @@ def main(): test_iteration = i print("") logging.info("Iteration number: %s" % (i+1)) - # Run any handcoded/directed assembly tests specified by args.asm_tests - if args.asm_tests != "": - asm_test = args.asm_tests.split(',') - for path_asm_test in asm_test: - full_path = os.path.expanduser(path_asm_test) - # path_asm_test is a directory - if os.path.isdir(full_path): - run_assembly_from_dir(full_path, args.iss_yaml, args.isa, args.mabi, - args.gcc_opts, args.iss, output_dir, - args.core_setting_dir, args.debug, args.priv, iss_timeout=args.iss_timeout) - # path_asm_test is an assembly file - elif os.path.isfile(full_path) or args.debug: - run_assembly(full_path, args.iss_yaml, args.isa, args.target, args.mabi, args.gcc_opts, - args.iss, output_dir, args.core_setting_dir, args.debug, args.linker, - args.priv, args.spike_params, iss_timeout=args.iss_timeout) - else: - logging.error('%s does not exist' % full_path) - 
sys.exit(RET_FAIL) - test_executed = 1 - # Run any handcoded/directed c tests specified by args.c_tests + # Run any handcoded/directed tests specified by args + tests = "" if args.c_tests != "": - c_test = args.c_tests.split(',') - for path_c_test in c_test: - full_path = os.path.expanduser(path_c_test) - # path_c_test is a directory - if os.path.isdir(full_path): - run_c_from_dir(full_path, args.iss_yaml, args.isa, args.mabi, - args.gcc_opts, args.iss, output_dir, - args.core_setting_dir, args.debug, args.priv, args.iss_timeout) + tests = args.c_tests.split(',') + elif args.elf_tests != "": + tests = args.elf_tests.split(',') + elif args.asm_tests != "": + tests = args.asm_tests.split(',') + if tests != "": + for path_test in tests: + full_path = os.path.expanduser(path_test) # path_c_test is a c file - elif os.path.isfile(full_path) or args.debug: - run_c(full_path, args.iss_yaml, args.isa, args.target, args.mabi, args.gcc_opts, + if os.path.isfile(full_path) or args.debug: + run_test(full_path, args.iss_yaml, args.isa, args.target, args.mabi, args.gcc_opts, args.iss, output_dir, args.core_setting_dir, args.debug, args.linker, args.priv, args.spike_params, iss_timeout=args.iss_timeout) else: - logging.error('%s does not exist' % full_path) - sys.exit(RET_FAIL) - test_executed = 1 - - # Run any handcoded/directed elf tests specified by args.elf_tests - if args.elf_tests != "": - elf_test = args.elf_tests.split(',') - for path_elf_test in elf_test: - full_path = os.path.expanduser(path_elf_test) - # path_elf_test is an elf file - if os.path.isfile(full_path) or args.debug: - run_elf(full_path, args.iss_yaml, args.isa, args.target, args.mabi, args.gcc_opts, - args.iss, output_dir, args.core_setting_dir, args.debug, args.priv, args.spike_params, iss_timeout=args.iss_timeout) - else: - logging.error('%s does not exist' % full_path) + logging.error('%s does not exist or is not a file' % full_path) sys.exit(RET_FAIL) test_executed = 1 @@ -1309,7 +1183,7 @@ def main(): 
if test_executed ==0: if not args.co: openhw_process_regression_list(args.testlist, args.test, args.iterations, matched_list, cwd) - logging.info('CVA6 Configuration is %s'% args.hwconfig_opts) + logging.info('CVA6 Configuration is %s and target is %s'% (args.hwconfig_opts, args.target)) for entry in list(matched_list): yaml_needs = entry["needs"] if "needs" in entry else [] if yaml_needs: @@ -1354,58 +1228,24 @@ def main(): run_cmd("%s" % copy) t['c_tests'] = re.sub(r'(.*)\/(.*).c$', r'\1/', t['c_tests'])+t['test']+'.c' + directed_tests_list = asm_directed_list + c_directed_list # Run instruction generator if args.steps == "all" or re.match(".*gen.*", args.steps): - # Run any handcoded/directed assembly tests specified in YAML format - if len(asm_directed_list) != 0: + # Run any handcoded/directed tests specified in YAML format + if len(directed_tests_list) != 0: for test_entry in asm_directed_list: gcc_opts = args.gcc_opts gcc_opts += test_entry.get('gcc_opts', '') - path_asm_test = os.path.expanduser(test_entry.get('asm_tests')) - if path_asm_test: - # path_asm_test is a directory - if os.path.isdir(path_asm_test): - run_assembly_from_dir(path_asm_test, args.iss_yaml, args.isa, args.mabi, - gcc_opts, args.iss, output_dir, - args.core_setting_dir, args.debug, args.priv, iss_timeout=args.iss_timeout) - # path_asm_test is an assembly file - elif os.path.isfile(path_asm_test): - run_assembly(path_asm_test, args.iss_yaml, args.isa, args.target, args.mabi, gcc_opts, + path_test = os.path.expanduser(test_entry.get('asm_tests')) + if path_test: + # path_test is an assembly file + if os.path.isfile(path_test): + run_test(path_test, args.iss_yaml, args.isa, args.target, args.mabi, gcc_opts, args.iss, output_dir, args.core_setting_dir, args.debug, args.linker, - args.priv, args.spike_params, test_entry['test'], iss_timeout=args.iss_timeout) - else: - if not args.debug: - logging.error('%s does not exist' % path_asm_test) - sys.exit(RET_FAIL) - - # Run any 
handcoded/directed C tests specified in YAML format - if len(c_directed_list) != 0: - for test_entry in c_directed_list: - gcc_opts = args.gcc_opts - gcc_opts += test_entry.get('gcc_opts', '') - - if 'sim_do' in test_entry: - sim_do = test_entry['sim_do'].split(';') - with open("sim.do", "w") as fd: - for cmd in sim_do: - fd.write(cmd + "\n") - logging.info('sim.do: %s' % sim_do) - - path_c_test = os.path.expanduser(test_entry.get('c_tests')) - if path_c_test: - # path_c_test is a directory - if os.path.isdir(path_c_test): - run_c_from_dir(path_c_test, args.iss_yaml, args.isa, args.mabi, - gcc_opts, args.iss, output_dir, - args.core_setting_dir, args.debug, args.priv, args.iss_timeout) - # path_c_test is a C file - elif os.path.isfile(path_c_test): - run_c(path_c_test, args.iss_yaml, args.isa, args.target, args.mabi, gcc_opts, - args.iss, output_dir, args.core_setting_dir, args.debug, args.linker, - args.priv, args.spike_params, test_entry['test'], iss_timeout=args.iss_timeout) + args.priv, args.spike_params, test_entry['test'], iss_timeout=args.iss_timeout, testlist=args.testlist) else: if not args.debug: - logging.error('%s does not exist' % path_c_test) + logging.error('%s does not exist' % path_test) sys.exit(RET_FAIL) # Run remaining tests using the instruction generator diff --git a/verif/sim/cva6.yaml b/verif/sim/cva6.yaml index 551707db89..6011296027 100644 --- a/verif/sim/cva6.yaml +++ b/verif/sim/cva6.yaml @@ -18,7 +18,7 @@ # Always keep this value in sync with the settings of RTL simulators (cf. # values below). 
cmd: > - make spike steps=2000000 variant= priv= elf= tool_path= log= spike_params='' + make spike steps=2000000 target= variant= priv= elf= tool_path= log= spike_params='' ############################################################################### # Verilator @@ -61,6 +61,13 @@ cmd: > make vcs-uvm target= cov=${cov} variant= elf= tool_path= isscomp_opts= issrun_opts= isspostrun_opts= log= +- iss: vcs-uvm-gate + path_var: RTL_PATH + tool_path: SPIKE_PATH + tb_path: TB_PATH + cmd: > + make vcs-uvm target= gate=1 cov=${cov} variant= elf= tool_path= isscomp_opts= issrun_opts= isspostrun_opts= log= + - iss: questa-uvm path_var: RTL_PATH tool_path: SPIKE_PATH diff --git a/verif/sim/cva6_spike_log_to_trace_csv.py b/verif/sim/cva6_spike_log_to_trace_csv.py index a16b515e66..f7558a4382 100644 --- a/verif/sim/cva6_spike_log_to_trace_csv.py +++ b/verif/sim/cva6_spike_log_to_trace_csv.py @@ -121,9 +121,10 @@ def read_spike_trace(path, full_trace): # true. Otherwise, we are in state EFFECT if instr is not None, otherwise we # are in state INSTR. + start_trampoline_re = re.compile(r'core.*: 0x0*10000 ') end_trampoline_re = re.compile(r'core.*: 0x0*10010 ') - in_trampoline = True + in_trampoline = False instr = None with open(path, 'r') as handle: @@ -133,6 +134,9 @@ def read_spike_trace(path, full_trace): if end_trampoline_re.match(line): in_trampoline = False continue + elif start_trampoline_re.match(line): + in_trampoline = True + continue if instr is None: # The INSTR state. We expect to see a line matching CORE_RE. 
diff --git a/verif/tb/core/uvma_cva6pkg_utils.sv b/verif/tb/core/uvma_cva6pkg_utils.sv index f2732098e3..0c5c86b460 100644 --- a/verif/tb/core/uvma_cva6pkg_utils.sv +++ b/verif/tb/core/uvma_cva6pkg_utils.sv @@ -19,14 +19,14 @@ function st_core_cntrl_cfg cva6pkg_to_core_cntrl_cfg(st_core_cntrl_cfg cfg); cfg.ext_a_supported = CVA6Cfg.RVA; cfg.ext_m_supported = 1; cfg.ext_c_supported = CVA6Cfg.RVC; - cfg.ext_p_supported = 1; + cfg.ext_p_supported = 0; cfg.ext_v_supported = CVA6Cfg.RVV; cfg.ext_f_supported = CVA6Cfg.RVF; cfg.ext_d_supported = CVA6Cfg.RVD; cfg.ext_zba_supported = CVA6Cfg.RVB; cfg.ext_zbb_supported = CVA6Cfg.RVB; cfg.ext_zbc_supported = CVA6Cfg.RVB; - cfg.ext_zbe_supported = CVA6Cfg.RVB; + cfg.ext_zbe_supported = 0; cfg.ext_zbf_supported = 0; cfg.ext_zbm_supported = 0; cfg.ext_zbp_supported = 0; @@ -38,12 +38,12 @@ function st_core_cntrl_cfg cva6pkg_to_core_cntrl_cfg(st_core_cntrl_cfg cfg); cfg.ext_zicsr_supported = 1; cfg.ext_zicntr_supported = 0; - cfg.ext_cv32a60x_supported = 1; + cfg.ext_cv32a60x_supported = 0; // FIXME TODO: Temporary solution. We need explicit info on memory map. // FORNOW The solution below relies on specific region ordering. 
- cfg.dram_base = CVA6Cfg.ExecuteRegionAddrBase[2]; - cfg.dram_size = CVA6Cfg.ExecuteRegionLength[2]; + cfg.dram_base = 'h40000000; + cfg.dram_size = 'h80000000; cfg.dram_valid = 1; cfg.disable_all_csr_checks = 0; @@ -78,7 +78,7 @@ function st_core_cntrl_cfg cva6pkg_to_core_cntrl_cfg(st_core_cntrl_cfg cfg); void'(spike_set_param_bool(base, "hide_csrs_based_on_priv", 1)); void'(spike_set_param_uint64_t(base, "mtvec_vectored_alignment", 64 * 4)); - void'(spike_set_param_str(base, "extensions", "cv32a60x")); + void'(spike_set_param_str(base, "extensions", "cvxif")); // All enabled except XS and TW bits void'(spike_set_param_uint64_t(base, "mstatus_write_mask", 'hFFDE_7FFF)); diff --git a/verif/tb/uvmt/cva6_tb_wrapper.sv b/verif/tb/uvmt/cva6_tb_wrapper.sv index a592d45c90..6a063ef844 100644 --- a/verif/tb/uvmt/cva6_tb_wrapper.sv +++ b/verif/tb/uvmt/cva6_tb_wrapper.sv @@ -29,6 +29,8 @@ import uvm_pkg::*; `include "uvm_macros.svh" +`include "cvxif_types.svh" + `ifndef DPI_FESVR_SPIKE_UTILS `define DPI_FESVR_SPIKE_UTILS @@ -46,19 +48,32 @@ module cva6_tb_wrapper import uvmt_cva6_pkg::*; #( parameter type rvfi_probes_instr_t = logic, parameter type rvfi_probes_csr_t = logic, parameter type rvfi_probes_t = logic, + // CVXIF Types + localparam type readregflags_t = `READREGFLAGS_T(CVA6Cfg), + localparam type writeregflags_t = `WRITEREGFLAGS_T(CVA6Cfg), + localparam type id_t = `ID_T(CVA6Cfg), + localparam type hartid_t = `HARTID_T(CVA6Cfg), + localparam type x_compressed_req_t = `X_COMPRESSED_REQ_T(CVA6Cfg, hartid_t), + localparam type x_compressed_resp_t = `X_COMPRESSED_RESP_T(CVA6Cfg), + localparam type x_issue_req_t = `X_ISSUE_REQ_T(CVA6Cfg, hartit_t, id_t), + localparam type x_issue_resp_t = `X_ISSUE_RESP_T(CVA6Cfg, writeregflags_t, readregflags_t), + localparam type x_register_t = `X_REGISTER_T(CVA6Cfg, hartid_t, id_t, readregflags_t), + localparam type x_commit_t = `X_COMMIT_T(CVA6Cfg, hartid_t, id_t), + localparam type x_result_t = `X_RESULT_T(CVA6Cfg, hartid_t, id_t, 
writeregflags_t), + localparam type cvxif_req_t = `CVXIF_REQ_T(CVA6Cfg, x_compressed_req_t, x_issue_req_t, x_register_req_t, x_commit_t), + localparam type cvxif_resp_t = `CVXIF_RESP_T(CVA6Cfg, x_compressed_resp_t, x_issue_resp_t, x_result_t), // parameter int unsigned AXI_USER_EN = 0, parameter int unsigned NUM_WORDS = 2**25 ) ( input logic clk_i, input logic rst_ni, - input logic [riscv::VLEN-1:0] boot_addr_i, + input logic [CVA6Cfg.VLEN-1:0] boot_addr_i, output logic [31:0] tb_exit_o, output rvfi_instr_t [CVA6Cfg.NrCommitPorts-1:0] rvfi_o, output rvfi_csr_t rvfi_csr_o, - input cvxif_pkg::cvxif_resp_t cvxif_resp, - output cvxif_pkg::cvxif_req_t cvxif_req, - input logic [2:0] irq_i, + input logic [15:0] irq_i, + uvma_debug_if debug_if, uvma_axi_intf axi_slave, uvmt_axi_switch_intf axi_switch_vif, uvmt_default_inputs_intf default_inputs_vif @@ -76,6 +91,9 @@ module cva6_tb_wrapper import uvmt_cva6_pkg::*; #( assign rvfi_o = rvfi_instr; assign rvfi_csr_o = rvfi_csr; + cvxif_req_t cvxif_req; + cvxif_resp_t cvxif_resp; + cva6 #( .CVA6Cfg ( CVA6Cfg ), .rvfi_probes_instr_t ( rvfi_probes_instr_t ), @@ -87,16 +105,25 @@ module cva6_tb_wrapper import uvmt_cva6_pkg::*; #( .boot_addr_i ( boot_addr_i ),//Driving the boot_addr value from the core control agent .hart_id_i ( default_inputs_vif.hart_id ), .irq_i ( {1'b0, irq_i[0]} ), - .ipi_i ( irq_i[1] ), - .time_irq_i ( irq_i[2] ), - .debug_req_i ( default_inputs_vif.debug_req ), + .ipi_i ( 1'b0 ), + .time_irq_i ( irq_i[1] ), + .debug_req_i ( debug_if.debug_req ), .rvfi_probes_o ( rvfi_probes ), .cvxif_req_o ( cvxif_req ), .cvxif_resp_i ( cvxif_resp ), - .noc_req_o ( axi_ariane_req ), - .noc_resp_i ( axi_ariane_resp ) + .noc_req_o ( axi_ariane_req ), + .noc_resp_i ( axi_ariane_resp ) ); + if (CVA6Cfg.CvxifEn) begin : gen_cvxif_default_response + always_comb begin + cvxif_resp = '0; + cvxif_resp.compressed_ready = 1'b1; + cvxif_resp.issue_ready = 1'b1; + cvxif_resp.register_ready = 1'b1; + end + end + 
//---------------------------------------------------------------------------- // RVFI //---------------------------------------------------------------------------- @@ -162,21 +189,21 @@ module cva6_tb_wrapper import uvmt_cva6_pkg::*; #( assign axi_ariane_resp.r.last = (axi_switch_vif.active) ? axi_slave.r_last : cva6_axi_bus.r_last; assign axi_ariane_resp.r.user = (axi_switch_vif.active) ? axi_slave.r_user : cva6_axi_bus.r_user; - assign axi_slave.aw_ready = (axi_switch_vif.active) ? axi_slave.aw_ready : cva6_axi_bus.aw_ready; - assign axi_slave.ar_ready = (axi_switch_vif.active) ? axi_slave.ar_ready : cva6_axi_bus.ar_ready; - assign axi_slave.w_ready = (axi_switch_vif.active) ? axi_slave.w_ready : cva6_axi_bus.w_ready; - assign axi_slave.b_valid = (axi_switch_vif.active) ? axi_slave.b_valid : cva6_axi_bus.b_valid; - assign axi_slave.r_valid = (axi_switch_vif.active) ? axi_slave.r_valid : cva6_axi_bus.r_valid; - - assign axi_slave.b_id = (axi_switch_vif.active) ? axi_slave.b_id : cva6_axi_bus.b_id; - assign axi_slave.b_resp = (axi_switch_vif.active) ? axi_slave.b_resp : cva6_axi_bus.b_resp; - assign axi_slave.b_user = (axi_switch_vif.active) ? axi_slave.b_user : cva6_axi_bus.b_user; - - assign axi_slave.r_id = (axi_switch_vif.active) ? axi_slave.r_id : cva6_axi_bus.r_id; - assign axi_slave.r_data = (axi_switch_vif.active) ? axi_slave.r_data : cva6_axi_bus.r_data; - assign axi_slave.r_resp = (axi_switch_vif.active) ? axi_slave.r_resp : cva6_axi_bus.r_resp; - assign axi_slave.r_last = (axi_switch_vif.active) ? axi_slave.r_last : cva6_axi_bus.r_last; - assign axi_slave.r_user = (axi_switch_vif.active) ? axi_slave.r_user : cva6_axi_bus.r_user; + assign axi_slave.aw_ready = (axi_switch_vif.active) ? 'z : cva6_axi_bus.aw_ready; + assign axi_slave.ar_ready = (axi_switch_vif.active) ? 'z : cva6_axi_bus.ar_ready; + assign axi_slave.w_ready = (axi_switch_vif.active) ? 'z : cva6_axi_bus.w_ready; + assign axi_slave.b_valid = (axi_switch_vif.active) ? 
'z : cva6_axi_bus.b_valid; + assign axi_slave.r_valid = (axi_switch_vif.active) ? 'z : cva6_axi_bus.r_valid; + + assign axi_slave.b_id = (axi_switch_vif.active) ? 'z : cva6_axi_bus.b_id; + assign axi_slave.b_resp = (axi_switch_vif.active) ? 'z : cva6_axi_bus.b_resp; + assign axi_slave.b_user = (axi_switch_vif.active) ? 'z : cva6_axi_bus.b_user; + + assign axi_slave.r_id = (axi_switch_vif.active) ? 'z : cva6_axi_bus.r_id; + assign axi_slave.r_data = (axi_switch_vif.active) ? 'z : cva6_axi_bus.r_data; + assign axi_slave.r_resp = (axi_switch_vif.active) ? 'z : cva6_axi_bus.r_resp; + assign axi_slave.r_last = (axi_switch_vif.active) ? 'z : cva6_axi_bus.r_last; + assign axi_slave.r_user = (axi_switch_vif.active) ? 'z : cva6_axi_bus.r_user; // Request structs assign axi_slave.aw_valid = axi_ariane_req.aw_valid; diff --git a/verif/tb/uvmt/uvmt_cva6.flist b/verif/tb/uvmt/uvmt_cva6.flist index c42ca778eb..d4f7651108 100644 --- a/verif/tb/uvmt/uvmt_cva6.flist +++ b/verif/tb/uvmt/uvmt_cva6.flist @@ -23,7 +23,7 @@ // Agents -f ${DV_UVMA_CLKNRST_PATH}/uvma_clknrst_pkg.flist --f ${DV_UVMA_CVXIF_PATH}/src/uvma_cvxif_pkg.flist +-f ${DV_UVMA_DEBUG_PATH}/uvma_debug_pkg.flist -f ${DV_UVMA_AXI_PATH}/src/uvma_axi_pkg.flist -f ${DV_UVMA_CORE_CNTRL_PATH}/uvma_core_cntrl_pkg.flist -f ${DV_UVMA_RVFI_PATH}/uvma_rvfi_pkg.flist diff --git a/verif/tb/uvmt/uvmt_cva6_axi_assert.sv b/verif/tb/uvmt/uvmt_cva6_axi_assert.sv index e2758f37dc..ab3472d5d6 100644 --- a/verif/tb/uvmt/uvmt_cva6_axi_assert.sv +++ b/verif/tb/uvmt/uvmt_cva6_axi_assert.sv @@ -18,42 +18,42 @@ module uvmt_cva6_axi_assert#(int HPDCache=2) //check if the CVA6 identify read transaction with an ID equal to 0 or 1 property AXI4_CVA6_ARID; - @(posedge axi_assert_if.clk && (HPDCache != 2)) disable iff (!axi_assert_if.rst_n) axi_assert_if.ar_valid |-> axi_assert_if.ar_id == 0 || axi_assert_if.ar_id == 1 || (axi_assert_if.ar_id == 3 && axi_assert_if.ar_lock == 1); + @(posedge axi_assert_if.clk && (HPDCache != 2) && 
axi_assert_if.axi_assertion_enabled) disable iff (!axi_assert_if.rst_n) axi_assert_if.ar_valid |-> axi_assert_if.ar_id == 0 || axi_assert_if.ar_id == 1 || (axi_assert_if.ar_id == 3 && axi_assert_if.ar_lock == 1); endproperty //check if the CVA6 identify write transaction with an ID equal to 0 or 1 property AXI4_CVA6_AWID; - @(posedge axi_assert_if.clk && (HPDCache != 2)) disable iff (!axi_assert_if.rst_n) axi_assert_if.aw_valid |-> axi_assert_if.aw_id == 1 || (axi_assert_if.aw_id == 3 && axi_assert_if.aw_atop != 0) || (axi_assert_if.aw_id == 3 && axi_assert_if.aw_lock == 1); + @(posedge axi_assert_if.clk && (HPDCache != 2) && axi_assert_if.axi_assertion_enabled) disable iff (!axi_assert_if.rst_n) axi_assert_if.aw_valid |-> axi_assert_if.aw_id == 1 || (axi_assert_if.aw_id == 3 && axi_assert_if.aw_atop != 0) || (axi_assert_if.aw_id == 3 && axi_assert_if.aw_lock == 1); endproperty //Check if user-defined extension for read address channel is equal to 0b00 property AXI4_CVA6_ARUSER; - @(posedge axi_assert_if.clk) disable iff (!axi_assert_if.rst_n) axi_assert_if.ar_valid |-> axi_assert_if.ar_user == 0; + @(posedge axi_assert_if.clk && axi_assert_if.axi_assertion_enabled) disable iff (!axi_assert_if.rst_n) axi_assert_if.ar_valid |-> axi_assert_if.ar_user == 0; endproperty //Check if user-defined extension for write address channel is equal to 0b00 property AXI4_CVA6_AWUSER; - @(posedge axi_assert_if.clk) disable iff (!axi_assert_if.rst_n) axi_assert_if.aw_valid |-> axi_assert_if.aw_user == 0; + @(posedge axi_assert_if.clk && axi_assert_if.axi_assertion_enabled) disable iff (!axi_assert_if.rst_n) axi_assert_if.aw_valid |-> axi_assert_if.aw_user == 0; endproperty //Check if Quality of Service identifier for write transaction is equal to 0b0000 property AXI4_CVA6_AWQOS; - @(posedge axi_assert_if.clk) disable iff (!axi_assert_if.rst_n) axi_assert_if.aw_valid |-> axi_assert_if.aw_qos == 0; + @(posedge axi_assert_if.clk && axi_assert_if.axi_assertion_enabled) disable iff 
(!axi_assert_if.rst_n) axi_assert_if.aw_valid |-> axi_assert_if.aw_qos == 0; endproperty //Check if Quality of Service identifier for read transaction is equal to 0b0000 property AXI4_CVA6_ARQOS; - @(posedge axi_assert_if.clk) disable iff (!axi_assert_if.rst_n) axi_assert_if.ar_valid |-> axi_assert_if.ar_qos == 0; + @(posedge axi_assert_if.clk && axi_assert_if.axi_assertion_enabled) disable iff (!axi_assert_if.rst_n) axi_assert_if.ar_valid |-> axi_assert_if.ar_qos == 0; endproperty //Check if Region indicator for write transaction is equal to 0b0000 property AXI4_CVA6_AWREGION; - @(posedge axi_assert_if.clk) disable iff (!axi_assert_if.rst_n) axi_assert_if.aw_valid |-> axi_assert_if.aw_region == 0; + @(posedge axi_assert_if.clk && axi_assert_if.axi_assertion_enabled) disable iff (!axi_assert_if.rst_n) axi_assert_if.aw_valid |-> axi_assert_if.aw_region == 0; endproperty //Check if Region indicator for read transaction is equal to 0b0000 property AXI4_CVA6_ARREGION; - @(posedge axi_assert_if.clk) disable iff (!axi_assert_if.rst_n) axi_assert_if.ar_valid |-> axi_assert_if.ar_region == 0; + @(posedge axi_assert_if.clk && axi_assert_if.axi_assertion_enabled) disable iff (!axi_assert_if.rst_n) axi_assert_if.ar_valid |-> axi_assert_if.ar_region == 0; endproperty //Check if AWCACHE is always equal to 0b0000 @@ -63,42 +63,42 @@ module uvmt_cva6_axi_assert#(int HPDCache=2) //Check if ARCACHE is always equal to 0b0000 property AXI4_CVA6_ARCACHE; - @(posedge axi_assert_if.clk && (HPDCache != 2)) disable iff (!axi_assert_if.rst_n) axi_assert_if.ar_valid |-> axi_assert_if.ar_cache == 2; + @(posedge axi_assert_if.clk && (HPDCache != 2) && axi_assert_if.axi_assertion_enabled) disable iff (!axi_assert_if.rst_n) axi_assert_if.ar_valid |-> axi_assert_if.ar_cache == 2; endproperty //Check if Protection attributes for write transaction always take the 0b000 property AXI4_CVA6_AWPROT; - @(posedge axi_assert_if.clk) disable iff (!axi_assert_if.rst_n) axi_assert_if.aw_valid |-> 
axi_assert_if.aw_prot == 0; + @(posedge axi_assert_if.clk && axi_assert_if.axi_assertion_enabled) disable iff (!axi_assert_if.rst_n) axi_assert_if.aw_valid |-> axi_assert_if.aw_prot == 0; endproperty //Check if Protection attributes for read transaction always take the 0b000 property AXI4_CVA6_ARPROT; - @(posedge axi_assert_if.clk) disable iff (!axi_assert_if.rst_n) axi_assert_if.ar_valid |-> axi_assert_if.ar_prot == 0; + @(posedge axi_assert_if.clk && axi_assert_if.axi_assertion_enabled) disable iff (!axi_assert_if.rst_n) axi_assert_if.ar_valid |-> axi_assert_if.ar_prot == 0; endproperty //Check if all write transaction performed by CVA6 are of type INCR property AXI4_CVA6_AWBURST; - @(posedge axi_assert_if.clk) disable iff (!axi_assert_if.rst_n) axi_assert_if.aw_valid |-> axi_assert_if.aw_burst == 1; + @(posedge axi_assert_if.clk && axi_assert_if.axi_assertion_enabled) disable iff (!axi_assert_if.rst_n) axi_assert_if.aw_valid |-> axi_assert_if.aw_burst == 1; endproperty //Check if all read transaction performed by CVA6 are of type INCR property AXI4_CVA6_ARBURST; - @(posedge axi_assert_if.clk) disable iff (!axi_assert_if.rst_n) axi_assert_if.ar_valid |-> axi_assert_if.ar_burst == 1; + @(posedge axi_assert_if.clk && axi_assert_if.axi_assertion_enabled) disable iff (!axi_assert_if.rst_n) axi_assert_if.ar_valid |-> axi_assert_if.ar_burst == 1; endproperty //Check if all write transaction performed by CVA6 are equal to 0 property AXI4_CVA6_AWLEN; - @(posedge axi_assert_if.clk) disable iff (!axi_assert_if.rst_n) axi_assert_if.aw_valid |-> axi_assert_if.aw_len == 0; + @(posedge axi_assert_if.clk && axi_assert_if.axi_assertion_enabled) disable iff (!axi_assert_if.rst_n) axi_assert_if.aw_valid |-> axi_assert_if.aw_len == 0; endproperty //Check if all Read transaction performed by CVA6 are equal to 0 or 1 property AXI4_CVA6_ARLEN; - @(posedge axi_assert_if.clk) disable iff (!axi_assert_if.rst_n) axi_assert_if.ar_valid |-> axi_assert_if.ar_len == 0 || axi_assert_if.ar_len 
== 1; + @(posedge axi_assert_if.clk && axi_assert_if.axi_assertion_enabled) disable iff (!axi_assert_if.rst_n) axi_assert_if.ar_valid |-> axi_assert_if.ar_len == 0 || axi_assert_if.ar_len == 1; endproperty //Check if all Write transaction performed by CVA6 are of type Non atomic, AtomicLoad or AtomicSwap property AXI4_CVA6_AWATOP; - @(posedge axi_assert_if.clk) disable iff (!axi_assert_if.rst_n) axi_assert_if.aw_valid |-> (axi_assert_if.aw_atop[5:4] == 0 || axi_assert_if.aw_atop[5:4] == 2 || axi_assert_if.aw_atop[5:4] == 3) && axi_assert_if.aw_atop[3] == 0; + @(posedge axi_assert_if.clk && axi_assert_if.axi_assertion_enabled) disable iff (!axi_assert_if.rst_n) axi_assert_if.aw_valid |-> (axi_assert_if.aw_atop[5:4] == 0 || axi_assert_if.aw_atop[5:4] == 2 || axi_assert_if.aw_atop[5:4] == 3) && axi_assert_if.aw_atop[3] == 0; endproperty /********************************************** Assert Property ******************************************************/ diff --git a/verif/tb/uvmt/uvmt_cva6_dut_wrap.sv b/verif/tb/uvmt/uvmt_cva6_dut_wrap.sv index 9e8bb9d2e4..500afcf45e 100644 --- a/verif/tb/uvmt/uvmt_cva6_dut_wrap.sv +++ b/verif/tb/uvmt/uvmt_cva6_dut_wrap.sv @@ -29,12 +29,12 @@ module uvmt_cva6_dut_wrap # ( ( uvma_clknrst_if clknrst_if, - uvma_cvxif_intf cvxif_if, uvma_axi_intf axi_if, uvmt_axi_switch_intf axi_switch_vif, uvmt_default_inputs_intf default_inputs_vif, uvme_cva6_core_cntrl_if core_cntrl_if, uvma_interrupt_if interrupt_vif, + uvma_debug_if debug_if, output logic[31:0] tb_exit_o, output rvfi_instr_t [CVA6Cfg.NrCommitPorts-1:0] rvfi_o, output rvfi_csr_t rvfi_csr_o @@ -59,9 +59,8 @@ module uvmt_cva6_dut_wrap # ( .clk_i ( clknrst_if.clk ), .rst_ni ( clknrst_if.reset_n ), .boot_addr_i ( boot_addr ), - .cvxif_resp ( cvxif_if.cvxif_resp_o ), - .cvxif_req ( cvxif_if.cvxif_req_i ), .irq_i ( interrupt_vif.irq ), + .debug_if ( debug_if ), .axi_slave ( axi_if ), .axi_switch_vif ( axi_switch_vif ), .default_inputs_vif ( default_inputs_vif ), @@ -70,16 +69,4 @@ module 
uvmt_cva6_dut_wrap # ( .rvfi_o ( rvfi_o ) ); - assign cvxif_if.cvxif_resp_o.x_compressed_ready = 0; - assign cvxif_if.cvxif_resp_o.x_compressed_resp = 0; - assign cvxif_if.cvxif_resp_o.x_issue_ready = 1; - assign cvxif_if.cvxif_resp_o.x_issue_resp = 0; - assign cvxif_if.cvxif_resp_o.x_result_valid = 0; - assign cvxif_if.cvxif_resp_o.x_result.id = 0; - assign cvxif_if.cvxif_resp_o.x_result.data = 0; - assign cvxif_if.cvxif_resp_o.x_result.rd = 0; - assign cvxif_if.cvxif_resp_o.x_result.we = 0; - assign cvxif_if.cvxif_resp_o.x_result.exc = 0; - assign cvxif_if.cvxif_resp_o.x_result.exccode = 0; - endmodule diff --git a/verif/tb/uvmt/uvmt_cva6_tb.sv b/verif/tb/uvmt/uvmt_cva6_tb.sv index 11c0e54f9e..5955f53697 100644 --- a/verif/tb/uvmt/uvmt_cva6_tb.sv +++ b/verif/tb/uvmt/uvmt_cva6_tb.sv @@ -59,18 +59,16 @@ module uvmt_cva6_tb; // Agent interfaces uvma_clknrst_if clknrst_if(); // clock and resets from the clknrst agent - uvma_cvxif_intf cvxif_if( - .clk(clknrst_if.clk), - .reset_n(clknrst_if.reset_n) - ); // cvxif from the cvxif agent + uvma_debug_if debug_if(); // debug uvma_axi_intf axi_if( .clk(clknrst_if.clk), .rst_n(clknrst_if.reset_n) ); - uvma_interrupt_if - interrupt_vif( - ); + uvma_interrupt_if interrupt_vif( + .clk(clknrst_if.clk), + .reset_n(clknrst_if.reset_n) + ); uvmt_axi_switch_intf axi_switch_vif(); uvme_cva6_core_cntrl_if core_cntrl_if(); @@ -83,12 +81,6 @@ module uvmt_cva6_tb; uvmt_default_inputs_intf default_inputs_vif(); - //bind assertion module for cvxif interface - bind uvmt_cva6_dut_wrap - uvma_cvxif_assert cvxif_assert(.cvxif_assert(cvxif_if), - .clk(clknrst_if.clk), - .reset_n(clknrst_if.reset_n) - ); //bind assertion module for axi interface bind uvmt_cva6_dut_wrap uvmt_axi_assert #(CVA6Cfg.DCacheType) axi_assert(.axi_assert_if(axi_if)); @@ -105,6 +97,8 @@ module uvmt_cva6_tb; ); // Status information generated by the Virtual Peripherals in the DUT WRAPPER memory. 
uvmt_tb_exit_if tb_exit_if ( .tb_exit_o()); + assign debug_if.clk = clknrst_if.clk; + assign debug_if.reset_n = clknrst_if.reset_n; /** * DUT WRAPPER instance */ @@ -122,7 +116,7 @@ module uvmt_cva6_tb; .NUM_WORDS (NUM_WORDS) ) cva6_dut_wrap ( .clknrst_if(clknrst_if), - .cvxif_if (cvxif_if), + .debug_if(debug_if), .axi_if (axi_if), .axi_switch_vif (axi_switch_vif), .default_inputs_vif (default_inputs_vif), @@ -357,21 +351,32 @@ module uvmt_cva6_tb; * Test bench entry point. */ initial begin : test_bench_entry_point + bit axi_assert_on; // Specify time format for simulation (units_number, precision_number, suffix_string, minimum_field_width) $timeformat(-9, 3, " ns", 8); - axi_if.aw_assertion_enabled = 1; - axi_if.w_assertion_enabled = 1; - axi_if.b_assertion_enabled = 1; - axi_if.ar_assertion_enabled = 1; - axi_if.r_assertion_enabled = 1; - axi_if.axi_assertion_enabled = 1; - axi_if.axi_amo_assertion_enabled = 1; + if($value$plusargs("uvmt_set_axi_assert_cfg=%0d", axi_assert_on)) begin + axi_if.aw_assertion_enabled = axi_assert_on; + axi_if.w_assertion_enabled = axi_assert_on; + axi_if.b_assertion_enabled = axi_assert_on; + axi_if.ar_assertion_enabled = axi_assert_on; + axi_if.r_assertion_enabled = axi_assert_on; + axi_if.axi_assertion_enabled = axi_assert_on; + axi_if.axi_amo_assertion_enabled = axi_assert_on; + end else begin + axi_if.aw_assertion_enabled = 1; + axi_if.w_assertion_enabled = 1; + axi_if.b_assertion_enabled = 1; + axi_if.ar_assertion_enabled = 1; + axi_if.r_assertion_enabled = 1; + axi_if.axi_assertion_enabled = 1; + axi_if.axi_amo_assertion_enabled = 1; + end // Add interfaces handles to uvm_config_db uvm_config_db#(virtual uvma_clknrst_if )::set(.cntxt(null), .inst_name("*.env.clknrst_agent"), .field_name("vif"), .value(clknrst_if)); - uvm_config_db#(virtual uvma_cvxif_intf )::set(.cntxt(null), .inst_name("*.env.cvxif_agent"), .field_name("vif"), .value(cvxif_if) ); + uvm_config_db#(virtual uvma_debug_if )::set(.cntxt(null), .inst_name("*.env"), 
.field_name("debug_vif"), .value(debug_if)); uvm_config_db#(virtual uvma_axi_intf )::set(.cntxt(null), .inst_name("*"), .field_name("axi_vif"), .value(axi_if)); uvm_config_db#(virtual uvmt_axi_switch_intf )::set(.cntxt(null), .inst_name("*.env"), .field_name("axi_switch_vif"), .value(axi_switch_vif)); uvm_config_db#(virtual uvmt_rvfi_if#( .CVA6Cfg(CVA6Cfg), .rvfi_instr_t(rvfi_instr_t), .rvfi_csr_t (rvfi_csr_t)))::set(.cntxt(null), .inst_name("*"), .field_name("rvfi_vif"), .value(rvfi_if)); @@ -385,9 +390,6 @@ module uvmt_cva6_tb; uvm_config_db#(int)::set(.cntxt(null), .inst_name("*"), .field_name("ENV_PARAM_INSTR_DATA_WIDTH"), .value(ENV_PARAM_INSTR_DATA_WIDTH) ); uvm_config_db#(int)::set(.cntxt(null), .inst_name("*"), .field_name("ENV_PARAM_RAM_ADDR_WIDTH"), .value(ENV_PARAM_RAM_ADDR_WIDTH) ); - // Set RTL parameters - uvm_config_db#(config_pkg::cva6_cfg_t)::set(.cntxt(null), .inst_name("*.env"), .field_name("CVA6Cfg"), .value(CVA6Cfg) ); - // Run test uvm_top.enable_print_topology = 0; // ENV coders enable this as a debug aid uvm_top.finish_on_completion = 1; diff --git a/verif/tests/custom/Zcmp/link.ld b/verif/tests/custom/Zcmp/link.ld index 6135e79df0..67e7d6cf05 100644 --- a/verif/tests/custom/Zcmp/link.ld +++ b/verif/tests/custom/Zcmp/link.ld @@ -28,6 +28,9 @@ SECTIONS . = ALIGN(0x1000); .tohost : { *(.tohost) } + . = ALIGN(0x1000); + .uvmif : { *(.uvmif) } + . 
= ALIGN(0x1000); .text : { *(.text) } diff --git a/verif/tests/custom/Zcmp/riscv_test.h b/verif/tests/custom/Zcmp/riscv_test.h index 38f5cb36cf..6fe83e415b 100644 --- a/verif/tests/custom/Zcmp/riscv_test.h +++ b/verif/tests/custom/Zcmp/riscv_test.h @@ -262,6 +262,9 @@ reset_vector: \ .align 6; .global tohost; tohost: .dword 0; \ .align 6; .global fromhost; fromhost: .dword 0; \ .popsection; \ + .pushsection .uvmif,"aw",@progbits; \ + .align 6; .global int_ack; int_ack: .dword 0; \ + .popsection; \ .align 4; .global begin_signature; begin_signature: #define RVTEST_DATA_END .align 4; .global end_signature; end_signature: diff --git a/verif/tests/custom/common/crt.S b/verif/tests/custom/common/crt.S index 799b82e6c8..d808114d21 100644 --- a/verif/tests/custom/common/crt.S +++ b/verif/tests/custom/common/crt.S @@ -260,3 +260,9 @@ tohost: .dword 0 .align 6 .globl fromhost fromhost: .dword 0 + +.section ".uvmif","aw",@progbits +# Alignment is 2**6 == 64 bytes +.align 6 +.globl int_ack +int_ack: .dword 0 diff --git a/verif/tests/custom/coremark/core_main.c b/verif/tests/custom/coremark/coremark_main.c similarity index 100% rename from verif/tests/custom/coremark/core_main.c rename to verif/tests/custom/coremark/coremark_main.c diff --git a/verif/tests/custom/cv_xif/cvxif_add_nop.S b/verif/tests/custom/cv_xif/cvxif_add_nop.S index 6a88eaa667..ead9aa4e93 100644 --- a/verif/tests/custom/cv_xif/cvxif_add_nop.S +++ b/verif/tests/custom/cv_xif/cvxif_add_nop.S @@ -21,16 +21,36 @@ main: # core of the test +start0: LOAD_RS(a0, 0x332211); LOAD_RS(a1, 0xDEADBEEF); LOAD_RS(a2, 0xDEADBEEF); + CUS_NOP(); + CUS_ADD(01010, 01010, 01011); + CUS_ADD(01011, 01010, 01011); + CUS_ADD(01010, 01011, 01011); + lw a0, num1; + CUS_ADD_RS1(01100,01010,01011); + lw a1, num2; + CUS_ADD(01010,01011,00000); - CUS_ADD(01010,01010,01010,01011); - CUS_NOP(00000,00000,00000,00000); - CUS_NOP(00000,00000,00000,00000); - CUS_ADD(01010,01010,01010,01011); - CUS_NOP(00000,00000,00000,00000); - 
CUS_ADD(01010,01010,01010,01011); +# Take branch to check for instruction kill + li a0, 0xCAFE; + li a1, 0xCAFE; + xor a2, a0, a1; + beqz a2, branch2; + +branch1: + CUS_ADD(00000, 00000, 01011); + CUS_NOP(); + CUS_NOP(); + +branch2: + CUS_ADD(01010, 01010, 01011); + lw a0, num1; + CUS_ADD_RS1(01100,01010,01011); + lw a1, num2; + CUS_ADD(01010,01011,00000); # (example of) final self-check test li a0, 0xCAFE; @@ -47,3 +67,8 @@ pass: # Success post-processing (messages, ecall setup etc.) li a0, 0x0; jal exit; + +.data + num1: .word 5 // First number + num2: .word 7 // Second number + result: .word 12 // Result \ No newline at end of file diff --git a/verif/tests/custom/cv_xif/cvxif_full.S b/verif/tests/custom/cv_xif/cvxif_full.S new file mode 100644 index 0000000000..c0e64d2178 --- /dev/null +++ b/verif/tests/custom/cv_xif/cvxif_full.S @@ -0,0 +1,104 @@ +# See LICENSE for license details. + +#***************************************************************************** +# Copyright 2022 Thales DIS design services SAS +# +# Licensed under the Solderpad Hardware Licence, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# SPDX-License-Identifier: Apache-2.0 WITH SHL-2.0 +# You may obtain a copy of the License at https://solderpad.org/licenses/ +# +#---------------------------------------------------------------------------------- + +#include "cvxif_macros.h" + + #------------------------------------------------------------- + # Custom tests + #------------------------------------------------------------- + + .globl main +main: + +# core of the test + +start0: + LOAD_RS(a0, 0x332211); + LOAD_RS(a1, 0xDEADBEEF); + LOAD_RS(a2, 0xDEADBEEF); + CUS_NOP(); + CUS_ADD(01010, 01010, 01011); + CUS_ADD(01011, 01010, 01011); + CUS_ADD(01010, 01011, 01011); + lw a0, num1; + CUS_ADD_RS1(01000,01010,01011); + lw a1, num2; + CUS_ADD(01010,01011,00000); + +# R4-Type RS3 Instructions + LOAD_RS(a0, 0x111111); + LOAD_RS(a1, 0x222222); + LOAD_RS(a2, 0x333333); + CUS_ADD_RS3_MADD(01010,01011,01100,01101); + CUS_ADD_RS3_MSUB(01010,01011,01100,01101); + CUS_ADD_RS3_NMADD(01010,01011,01100,01101); + CUS_ADD_RS3_NMSUB(01010,01011,01100,01101); + +# R-type RS3 Instruction + LOAD_RS(a0, 0x111111); + LOAD_RS(a1, 0x222222); + LOAD_RS(a2, 0x333333); + CUS_ADD_RS3_RTYPE(01010,01011,01100); # --> result in 0b01010 + CUS_ADD_RS3_RTYPE(01010,01011,01100); # --> result in 0b01010 + CUS_ADD_RS3_RTYPE(01010,01011,01100); # --> result in 0b01010 + +# Take branch to check for instruction kill + li a0, 0xCAFE; + li a1, 0xCAFE; + xor a2, a0, a1; + beqz a2, branch2; + +branch1: + CUS_ADD(00000, 00000, 01011); + CUS_NOP(); + CUS_NOP(); + +branch2: + CUS_ADD(01010, 01010, 01011); + lw a0, num1; + CUS_ADD_RS1(01000,01010,01011); + lw a1, num2; + CUS_ADD(01010,01011,00000); + +# Compressed instruction leads to error in spike + LOAD_RS(a0, 0x111111); + LOAD_RS(a1, 0x222222); + LOAD_RS(a2, 0x333333); + CUS_CNOP(); + CUS_CADD(01011,01010); + xor a1, a0, a2; + beqz a2, pass; + +# Should raise an exception +# .half 0x0000; +# .word 0xFFFFFFFF; + +# (example of) final self-check test +# li a0, 0xCAFE; +# li a1, 0xCAFE; +# xor a2, a0, 
a1; +# beqz a2, pass; + +fail: + # Failure post-processing (messages, ecall setup etc.) + li a0, 0x0; + jal exit; + +pass: + # Success post-processing (messages, ecall setup etc.) + li a0, 0x0; + jal exit; + +.data + num1: .word 5 // First number + num2: .word 7 // Second number + result: .word 12 // Result \ No newline at end of file diff --git a/verif/tests/custom/cv_xif/cvxif_macros.h b/verif/tests/custom/cv_xif/cvxif_macros.h index f7e4448ce8..be2a692c26 100644 --- a/verif/tests/custom/cv_xif/cvxif_macros.h +++ b/verif/tests/custom/cv_xif/cvxif_macros.h @@ -6,14 +6,23 @@ // You may obtain a copy of the License at https://solderpad.org/licenses/ // // Original Author: Zineb EL KACIMI (zineb.el-kacimi@external.thalesgroup.com) +// Contributor : Guillaume Chauvon #define LOAD_RS(rs,value) li rs, value #define COMP_RS(rs1,rs2,rd) xor rd, rs1, rs2 + +#define CUS_NOP() .word 0b##0000000##00000####00000##000##00000##1111011 #define CUS_ADD(rs1,rs2,rd) .word 0b##0000000##rs2####rs1##001##rd##1111011 -#define CUS_NOP(rs1,rs2,rd) .word 0b##0000000##00000####00000##000##00000##1111011 -#define CUS_ADD_RS3(rs1,rs2,rs3,rd) .word 0b##rs3##01##rs2####rs1##000##rd##1111011 -#define CUS_ADD_MULTI(rs1,rs2,rd) .word 0b##0001000##rs2####rs1##000##rd##1111011 -#define CUS_EXC(rs1,rs2,rs3,rd) .word 0b####1100000##rs2####rs1##010##rd##1111011 -#define CUS_U_ADD(rs1,rs2,rd) .word 0b####0000010##rs2####rs1##000##rd##1111011 -#define CUS_S_ADD(rs1,rs2,rd) .word 0b####0000110##rs2####rs1##000##rd##1111011 +#define CUS_ADD_RS1(rs1,rs2,rd) .word 0b##0000001##rs2####rs1##001##rd##1111011 // only use rs1 : rs1 + rs1 => rd +#define CUS_ADD_RS2(rs1,rs2,rd) .word 0b##0000010##rs2####rs1##001##rd##1111011 // only use rs2 : rs2 + rs2 => rd +#define CUS_ADD_MULTI(rs1,rs2,rd) .word 0b##0000011##rs2####rs1##001##rd##1111011 + +#define CUS_ADD_RS3_MADD(rs1,rs2,rs3,rd) .word 0b##rs3##00##rs2####rs1##000##rd##1000011 //MADD +#define CUS_ADD_RS3_MSUB(rs1,rs2,rs3,rd) .word 
0b##rs3##00##rs2####rs1##000##rd##1000111 //MSUB +#define CUS_ADD_RS3_NMSUB(rs1,rs2,rs3,rd) .word 0b##rs3##00##rs2####rs1##000##rd##1001011 //NMSUB +#define CUS_ADD_RS3_NMADD(rs1,rs2,rs3,rd) .word 0b##rs3##00##rs2####rs1##000##rd##1001111 //NMADD +#define CUS_ADD_RS3_RTYPE(rs1,rs2,rs3) .word 0b##0000100##rs2####rs1##001##rs3##1111011 + +#define CUS_CADD(rs1, rs2) .half 0b##111##1##rs1##rs2##00 // -> Extend to CUS_ADD(rs1,rs2,x10) +#define CUS_CNOP() .half 0b##111##0##00000##00000##00 // -> Extend to CUS_NOP diff --git a/verif/tests/custom/debug_test/bsp/.gitignore b/verif/tests/custom/debug_test/bsp/.gitignore new file mode 100644 index 0000000000..c0a1f349c4 --- /dev/null +++ b/verif/tests/custom/debug_test/bsp/.gitignore @@ -0,0 +1 @@ +libcv-verif.a diff --git a/verif/tests/custom/debug_test/bsp/Makefile b/verif/tests/custom/debug_test/bsp/Makefile new file mode 100644 index 0000000000..b34ad75cea --- /dev/null +++ b/verif/tests/custom/debug_test/bsp/Makefile @@ -0,0 +1,36 @@ +CV_SW_TOOLCHAIN ?= /opt/riscv +RISCV ?= $(CV_SW_TOOLCHAIN) +RISCV_EXE_PREFIX ?= $(RISCV)/bin/riscv32-unknown-elf- +RISCV_CC ?= gcc +RISCV_GCC = $(RISCV_EXE_PREFIX)$(RISCV_CC) +RISCV_AR = $(RISCV_EXE_PREFIX)ar +RISCV_MARCH ?= rv32im_zca +SRC = crt0.S handlers.S syscalls.c vectors.S +OBJ = crt0.o handlers.o syscalls.o vectors.o +LIBCV-VERIF = libcv-verif.a +CFLAGS ?= -Os -g -static -mabi=ilp32 -march=$(RISCV_MARCH) -Wall -pedantic -mno-relax $(RISCV_CFLAGS) + +all: $(LIBCV-VERIF) + +$(LIBCV-VERIF): $(OBJ) + $(RISCV_AR) rcs $@ $(OBJ) + +%.o : %.c + $(RISCV_GCC) $(CFLAGS) -c $< -o $@ + +%.o : %.S + $(RISCV_GCC) $(CFLAGS) -c $< -o $@ + +clean: + rm -f $(OBJ) $(LIBCV-VERIF) + + +vars: + @echo "make bsp variables:" + @echo " CV_SW_TOOLCHAIN = $(CV_SW_TOOLCHAIN)" + @echo " RISCV = $(RISCV)" + @echo " RISCV_EXE_PREFIX = $(RISCV_EXE_PREFIX)" + @echo " RISCV_GCC = $(RISCV_GCC)" + @echo " RISCV_MARCH = $(RISCV_MARCH)" + @echo " RISCV_CFLAGS = $(RISCV_CFLAGS)" + @echo " CFLAGS = $(CFLAGS)" diff --git 
a/verif/tests/custom/debug_test/bsp/README.md b/verif/tests/custom/debug_test/bsp/README.md new file mode 100644 index 0000000000..2737d94c3f --- /dev/null +++ b/verif/tests/custom/debug_test/bsp/README.md @@ -0,0 +1,166 @@ +Board Support Package (BSP) for CVA6 Verification +================================================= + +This BSP provides the code to support running programs on the CVA6 verification +target. It performs initialization tasks (`crt0.S`), handles +interrupts/exceptions (`vectors.S`, `handlers.S`), provides syscall +implementations (`syscalls.c`) and includes a linker script (`link.ld`) to +control the placement of sections in the binary. + +Each file is described in more detail below followed by instructions for +building and using the BSP. + +C Runtime Initialization +------------------------ + +The C Runtime file `crt0.S` provides the `_start` function which is the entry +point of the program and performs the following tasks: + * Initialize global and stack pointer. + * Store the address of `vector_table` in `mtvec`, setting the lower two bits + to `0x2` to select vectored interrupt mode. + * Zero the BSS section. + * Invoke initialization of C constructors and set destructors to be called on + exit. + * Zero `argc` and `argv` (the stack is not initialized, so these are zeroed + to prevent uninitialized values causing a mismatch against the reference + result). + * Call `main`. + * If `main` returns, call `exit` with its return code. + +Interrupt and Exception Handling +-------------------------------- + +When a RISC-V core traps on an interrupt/exception, the `pc` is stored in `mepc` +and the reason for the trap is stored in `mcause`. The `MSB` of `mcause` +is set to `0` for an exception and `1` for an interrupt; the remaining bits +`mcause[MXLEN-2:0]` contain the exception code. 
The table of `mcause` values is +defined in Table 3.6 of the [RISC-V Instruction Set Manual Volume II: Privileged +Architecture Version 20190608-Priv-MSU-Ratified](https://github.com/riscv/riscv-isa-manual/releases/download/Ratified-IMFDQC-and-Priv-v1.11/riscv-privileged-20190608.pdf). + +The core jumps to a location in the vector table according to the `BASE` address +of the vector table stored in `mtvec` and the value of the exception code in +`mcause`. In vectored mode, all exceptions jump to `BASE` and interrupts jump to +`BASE+4*mcause[XLEN-2:0]`. Note that because user software interrupts have +exception code `0`, they jump to the same location as exceptions, therefore the +user software interrupt handler must also handle exceptions. + +The vector table is defined in `vectors.S` and may jump to one of the +following interrupt request handlers in `handlers.S`: + * `u_sw_irq_handler` - handles user software interrupts and all exceptions. + Saves all caller saved registers then checks `mcause` and jumps to the + appropriate handler as follows: + - Breakpoint: jump to `handle_ebreak`. + - Illegal instruction: jump to `handle_illegal`. + - Environment call from M-mode: jump to `handle_ecall`. + - Any other exception or user software interrupt: jump to `handle_unknown`. + * `m_software_irq_handler` - handles machine-mode software interrupts + - Currently jumps to `__no_irq_handler`. Behavior to be defined in future commit. + * `m_timer_irq_handler` - handles machine-mode timer interrupts + - Currently jumps to `__no_irq_handler`. Behavior to be defined in future commit. + * `m_external_irq_handler` - handles machine-mode external interrupts + - Currently jumps to `__no_irq_handler`. Behavior to be defined in future commit. + * `m_fast0_irq_handler` - handles machine-mode fast external interrupts (platform extension for CV32) + - Currently jumps to `__no_irq_handler`. Behavior to be defined in future commit. 
+ * `m_fast1_irq_handler` - handles machine-mode fast external interrupts (platform extension for CV32) + - Currently jumps to `__no_irq_handler`. Behavior to be defined in future commit. + * `m_fast2_irq_handler` - handles machine-mode fast external interrupts (platform extension for CV32) + - Currently jumps to `__no_irq_handler`. Behavior to be defined in future commit. + * `m_fast3_irq_handler` - handles machine-mode fast external interrupts (platform extension for CV32) + - Currently jumps to `__no_irq_handler`. Behavior to be defined in future commit. + * `m_fast4_irq_handler` - handles machine-mode fast external interrupts (platform extension for CV32) + - Currently jumps to `__no_irq_handler`. Behavior to be defined in future commit. + * `m_fast5_irq_handler` - handles machine-mode fast external interrupts (platform extension for CV32) + - Currently jumps to `__no_irq_handler`. Behavior to be defined in future commit. + * `m_fast6_irq_handler` - handles machine-mode fast external interrupts (platform extension for CV32) + - Currently jumps to `__no_irq_handler`. Behavior to be defined in future commit. + * `m_fast7_irq_handler` - handles machine-mode fast external interrupts (platform extension for CV32) + - Currently jumps to `__no_irq_handler`. Behavior to be defined in future commit. + * `m_fast8_irq_handler` - handles machine-mode fast external interrupts (platform extension for CV32) + - Currently jumps to `__no_irq_handler`. Behavior to be defined in future commit. + * `m_fast9_irq_handler` - handles machine-mode fast external interrupts (platform extension for CV32) + - Currently jumps to `__no_irq_handler`. Behavior to be defined in future commit. + * `m_fast10_irq_handler` - handles machine-mode fast external interrupts (platform extension for CV32) + - Currently jumps to `__no_irq_handler`. Behavior to be defined in future commit. 
+ * `m_fast11_irq_handler` - handles machine-mode fast external interrupts (platform extension for CV32) + - Currently jumps to `__no_irq_handler`. Behavior to be defined in future commit. + * `m_fast12_irq_handler` - handles machine-mode fast external interrupts (platform extension for CV32) + - Currently jumps to `__no_irq_handler`. Behavior to be defined in future commit. + * `m_fast13_irq_handler` - handles machine-mode fast external interrupts (platform extension for CV32) + - Currently jumps to `__no_irq_handler`. Behavior to be defined in future commit. + * `m_fast14_irq_handler` - handles machine-mode fast external interrupts (platform extension for CV32) + - Currently jumps to `__no_irq_handler`. Behavior to be defined in future commit. + * `m_fast15_irq_handler` - handles machine-mode fast external interrupts (platform extension for CV32) + - Currently jumps to `__no_irq_handler`. Behavior to be defined in future commit. + * `__no_irq_handler` - loops printing "no exception handler installed". + +The following exception handlers may be called from `u_sw_irq_handler`: + * `handle_ecall` - calls `handle_syscall` which checks the syscall number and + calls the corresponding syscall function. + * `handle_ebreak` - currently just prints "ebreak exception handler entered" + * `handle_illegal_insn` - prints "illegal instruction exception handler + entered" + * `unknown_handler` - called when there is no handler for the interrupt/ + exception. This is the only case where `mepc` is not incremented, because we + do not know the appropriate action to take. + +Returning from the `u_sw_irq_handler`. All handlers called by `u_sw_irq_handler` +increment `mepc` before calling `mret`, except for `unknown_handler`. Handlers +that require `mepc` to be incremented jump to `end_handler_incr_mepc`; otherwise +they jump to `end_handler_ret`. All caller saved registers are restored before +finally calling `mret`. 
+ +Some test cases require the ability to override the default handlers. In future, +these handlers will be made overridable by defining their labels as `.weak` +symbols. Test cases can then provide their own handlers where necessary. + +System Calls +------------ + +On a bare-metal system there is no OS to handle system calls, therefore, we +define our own system calls in `syscalls.c`. For example, the implementation of +`_write` outputs a byte at a time to the virtual printer peripheral. Many of the +functions provide minimal implementations that simply fail gracefully due to +lack of necessary OS support e.g. no file system. + +The [RISC-V Instruction Set Manual Volume I: Unprivileged ISA Version 20191213]( +https://content.riscv.org/wp-content/uploads/2019/06/riscv-spec.pdf) states that +for an `ecall` the "ABI for the system will define how parameters for the +environment request are passed". This BSP follows the convention used for RISC-V +in `newlib`. Parameters are passed in registers `a0` to `a5` and system call ID +in `a7` (`t0` on RV32E). When handling an `ecall`, `handle_ecall` calls +`handle_syscall` which then calls the appropriate function that implements the +system call, passing parameters as necessary. + +Linker Script +------------- + +The linker script defines the memory layout and controls the mapping of input +sections from object files to output sections in the output binary. 
+ +The `link.ld` script is based on the standard upstream RV32 linker script, with +some changes required for CVA6: + * Memory layout is defined as follows: + * `ram` start=0x0, length=4MB + * `dbg` start=0x1A110800, length=2KB + * Changes to output section placement are as follows: + - `.vectors` start=ORIGIN(`ram`) + - `.init` start=0x80 + - `.heap` starts at end of data and grows upwards + - `.stack` starts at the end of `ram` and grows downwards + - `.debugger` start=ORIGIN(`dbg`) + - `.debugger_exception` start=0x1A110C00 + - `.debugger_stack` follows `.debugger_exception` + +Building and using the BSP Library +---------------------------------- + +The BSP can be built in this directory as follows: +``` +make +``` +This produces libcv-verif.a which can then be linked with a test program as +follows: + +``` +gcc test-program.c -nostartfiles -T/path/to/bsp/link.ld -L/path/to/bsp/ -lcv-verif +``` diff --git a/verif/tests/custom/debug_test/bsp/bsp.h b/verif/tests/custom/debug_test/bsp/bsp.h new file mode 100644 index 0000000000..ca3035146f --- /dev/null +++ b/verif/tests/custom/debug_test/bsp/bsp.h @@ -0,0 +1,184 @@ +// Copyright 2022 Silicon Laboratories Inc. +// +// SPDX-License-Identifier: Apache-2.0 WITH SHL-2.1 +// +// Licensed under the Solderpad Hardware License v 2.1 (the "License"); you +// may not use this file except in compliance with the License, or, at your +// option, the Apache License version 2.0. +// +// You may obtain a copy of the License at +// https://solderpad.org/licenses/SHL-2.1/ +// +// Unless required by applicable law or agreed to in writing, any work +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// +// See the License for the specific language governing permissions and +// limitations under the License. 
+ + +enum { /* Exception cause codes (EXC_CAUSE_*); presumably compared against the mcause exception code - verify against the code that uses them. */ + EXC_CAUSE_INSTR_ACC_FAULT = 1, + EXC_CAUSE_ILLEGAL_INSTR = 2, + EXC_CAUSE_BREAKPOINT = 3, + EXC_CAUSE_LOAD_ACC_FAULT = 5, + EXC_CAUSE_STORE_ACC_FAULT = 7, + EXC_CAUSE_ENV_CALL_U = 8, + EXC_CAUSE_ENV_CALL_M = 11, + EXC_CAUSE_INSTR_BUS_FAULT = 24, + EXC_CAUSE_INSTR_INTEGRITY_FAULT = 25, +}; + +typedef enum { /* PMP mode encodings 0..3; presumably the values of the 2-bit `a` field of pmpsubcfg_t - TODO confirm. */ + PMPMODE_OFF = 0, + PMPMODE_TOR = 1, + PMPMODE_NA4 = 2, + PMPMODE_NAPOT = 3 +} pmp_mode_t; + +// Verbosity levels (Akin to the uvm verbosity concept) +typedef enum { + V_OFF = 0, + V_LOW = 1, + V_MEDIUM = 2, + V_HIGH = 3, + V_DEBUG = 4 +} verbosity_t; + +// Matches funct3 values for CSR instructions +typedef enum { + CSRRW = 1, + CSRRS = 2, + CSRRC = 3, + CSRRWI = 5, + CSRRSI = 6, + CSRRCI = 7 +} csr_instr_access_t; + +typedef union { /* CSR instruction word: bitfield view `fields` (widths 7, 5, 3, 5 and 12 bits, 32 in total) overlaid on the whole word `raw`. */ + struct { + volatile uint32_t opcode : 7; + volatile uint32_t rd : 5; + volatile uint32_t funct3 : 3; + volatile uint32_t rs1_uimm : 5; + volatile uint32_t csr : 12; + } volatile fields; + volatile uint32_t raw; +} __attribute__((packed)) csr_instr_t; + +typedef union { /* 32-bit bitfield view `fields` overlaid on `raw`; presumably the mcontrol6 trigger CSR, going by the type name - layout not checked against the debug spec here. */ + struct { + volatile uint32_t load : 1; + volatile uint32_t store : 1; + volatile uint32_t execute : 1; + volatile uint32_t u : 1; + volatile uint32_t s : 1; + volatile uint32_t res_5_5 : 1; + volatile uint32_t m : 1; + volatile uint32_t match : 4; + volatile uint32_t chain : 1; + volatile uint32_t action : 4; + volatile uint32_t size : 4; + volatile uint32_t timing : 1; + volatile uint32_t select : 1; + volatile uint32_t hit : 1; + volatile uint32_t vu : 1; + volatile uint32_t vs : 1; + volatile uint32_t res_26_25: 2; + volatile uint32_t dmode : 1; + volatile uint32_t type : 4; + } __attribute__((packed)) volatile fields; + volatile uint32_t raw; +} __attribute__((packed)) mcontrol6_t; + +typedef union { /* Status word: bitfield view `fields` (32 bits in total) overlaid on `raw`; the number after each field is its bit position. */ + struct { + volatile uint32_t uie : 1; // 0 + volatile uint32_t sie : 1; // 1 + volatile uint32_t wpri : 1; // 2 + volatile uint32_t mie : 1; // 3 + volatile uint32_t upie : 1; // 4 + volatile uint32_t spie : 1; // 5 + volatile uint32_t wpri0 : 1; 
// 6 + volatile uint32_t mpie : 1; // 7 + volatile uint32_t spp : 1; // 8 + volatile uint32_t wpri1 : 2; // 10: 9 + volatile uint32_t mpp : 2; // 12:11 + volatile uint32_t fs : 2; // 14:13 + volatile uint32_t xs : 2; // 16:15 + volatile uint32_t mprv : 1; // 17 + volatile uint32_t sum : 1; // 18 + volatile uint32_t mxr : 1; // 19 + volatile uint32_t tvm : 1; // 20 + volatile uint32_t tw : 1; // 21 + volatile uint32_t tsr : 1; // 22 + volatile uint32_t wpri3 : 8; // 30:23 + volatile uint32_t sd : 1; // 31 + } volatile fields; + volatile uint32_t raw; +} __attribute__((packed)) mstatus_t; + +typedef union { /* 32-bit word: single-bit flags mml, mmwp and rlb plus 29 reserved bits, overlaid on `raw`. */ + struct { + volatile uint32_t mml : 1; + volatile uint32_t mmwp : 1; + volatile uint32_t rlb : 1; + volatile uint32_t reserved_31_3 : 29; + } __attribute__((packed)) volatile fields; + volatile uint32_t raw : 32; +} __attribute__((packed)) mseccfg_t; + +typedef union { /* 32-bit word with one named bit, jvt_access, between two reserved fields (the reserved field names place it at bit 2). */ + struct { + volatile uint32_t reserved_1_0 : 2; + volatile uint32_t jvt_access : 1; + volatile uint32_t reserved_31_3 : 29; + } __attribute__((packed)) volatile fields; + volatile uint32_t raw : 32; +} __attribute__((packed)) mstateen0_t; + +typedef union { /* One 8-bit configuration entry: flags r, w, x, a 2-bit `a` field, 2 reserved bits and flag l; `raw` is the 8-bit view. */ + struct { + volatile uint32_t r : 1; + volatile uint32_t w : 1; + volatile uint32_t x : 1; + volatile uint32_t a : 2; + volatile uint32_t reserved_6_5 : 2; + volatile uint32_t l : 1; + } __attribute__((packed)) volatile fields; + volatile uint32_t raw : 8; +} __attribute__((packed)) pmpsubcfg_t; + +typedef union { /* 32-bit word viewed as four 8-bit `cfg` entries, reg_idx[0] to reg_idx[3]; presumably each entry follows the pmpsubcfg_t layout - TODO confirm. */ + struct { + volatile uint32_t cfg : 8; + } __attribute__((packed)) volatile reg_idx[4]; + volatile uint32_t raw : 32; +} __attribute__((packed)) pmpcfg_t; + +typedef union { /* 32-bit word split into a 6-bit `mode` field and a 26-bit `base` field, overlaid on `raw`. */ + struct { + volatile uint32_t mode : 6; + volatile uint32_t base : 26; + } __attribute__((packed)) volatile fields; + volatile uint32_t raw : 32; +} __attribute__((packed)) jvt_t; + +typedef union { /* 32-bit cause word with two alternative bitfield layouts, `clint` and `clic`, both overlaid on `raw`. */ + struct { + volatile uint32_t exccode : 12; + volatile uint32_t res_30_12 : 19; + volatile uint32_t interrupt : 1; + } __attribute__((packed)) volatile clint; + 
struct { + volatile uint32_t exccode : 12; + volatile uint32_t res_15_12 : 4; + volatile uint32_t mpil : 8; + volatile uint32_t res_26_24 : 3; + volatile uint32_t mpie : 1; + volatile uint32_t mpp : 2; + volatile uint32_t minhv : 1; + volatile uint32_t interrupt : 1; + } __attribute__((packed)) volatile clic; + volatile uint32_t raw : 32; +} __attribute__((packed)) mcause_t; diff --git a/verif/tests/custom/debug_test/bsp/corev_uvmt.h b/verif/tests/custom/debug_test/bsp/corev_uvmt.h new file mode 100644 index 0000000000..6241b71ad6 --- /dev/null +++ b/verif/tests/custom/debug_test/bsp/corev_uvmt.h @@ -0,0 +1,76 @@ +#ifndef __COREV_UVMT_H__ +#define __COREV_UVMT_H__ + +/* +** +** Copyright 2021 OpenHW Group +** Copyright 2021 Silicon Labs +** +** Licensed under the Solderpad Hardware Licence, Version 2.0 (the "License"); +** you may not use this file except in compliance with the License. +** You may obtain a copy of the License at +** +** https://solderpad.org/licenses/ +** +** Unless required by applicable law or agreed to in writing, software +** distributed under the License is distributed on an "AS IS" BASIS, +** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +** See the License for the specific language governing permissions and +** limitations under the License. 
+** +******************************************************************************* +** CORE-V UVM Testbench (UVMT) defines +******************************************************************************* +*/ + +#define CV_VP_REGISTER_BASE 0x80800000 + +#define CV_VP_VIRTUAL_PRINTER_OFFSET 0x00000000 +#define CV_VP_RANDOM_NUM_OFFSET 0x00000040 +#define CV_VP_CYCLE_COUNTER_OFFSET 0x00000080 +#define CV_VP_STATUS_FLAGS_OFFSET 0x000000c0 +#define CV_VP_FENCEI_TAMPER_OFFSET 0x00000100 +#define CV_VP_INTR_TIMER_OFFSET 0x00000140 +#define CV_VP_DEBUG_CONTROL_OFFSET 0x00000180 +#define CV_VP_OBI_SLV_RESP_OFFSET 0x000001c0 +#define CV_VP_SIG_WRITER_OFFSET 0x00000200 +#define CV_VP_OBI_ERR_AWAIT_GOAHEAD_OFFSET 0x00000240 + +#define CV_VP_CYCLE_COUNTER_BASE (CV_VP_REGISTER_BASE + CV_VP_CYCLE_COUNTER_OFFSET) +#define CV_VP_DEBUG_CONTROL_BASE (CV_VP_REGISTER_BASE + CV_VP_DEBUG_CONTROL_OFFSET) +#define CV_VP_FENCEI_TAMPER_BASE (CV_VP_REGISTER_BASE + CV_VP_FENCEI_TAMPER_OFFSET) +#define CV_VP_INTR_TIMER_BASE (CV_VP_REGISTER_BASE + CV_VP_INTR_TIMER_OFFSET) +#define CV_VP_OBI_ERR_AWAIT_GOAHEAD_BASE (CV_VP_REGISTER_BASE + CV_VP_OBI_ERR_AWAIT_GOAHEAD_OFFSET) +#define CV_VP_OBI_SLV_RESP_BASE (CV_VP_REGISTER_BASE + CV_VP_OBI_SLV_RESP_OFFSET) +#define CV_VP_RANDOM_NUM_BASE (CV_VP_REGISTER_BASE + CV_VP_RANDOM_NUM_OFFSET) +#define CV_VP_SIG_WRITER_BASE (CV_VP_REGISTER_BASE + CV_VP_SIG_WRITER_OFFSET) +#define CV_VP_STATUS_FLAGS_BASE (CV_VP_REGISTER_BASE + CV_VP_STATUS_FLAGS_OFFSET) +#define CV_VP_VIRTUAL_PRINTER_BASE (CV_VP_REGISTER_BASE + CV_VP_VIRTUAL_PRINTER_OFFSET) + +// -------------------------------------------------------------------------- +// Registers inside the OBI_SLV_RESP VP +// -------------------------------------------------------------------------- +#define CV_VP_OBI_SLV_RESP_I_ERR_ADDR_MIN ((volatile uint32_t*) (CV_VP_OBI_SLV_RESP_BASE + 4*0)) +#define CV_VP_OBI_SLV_RESP_I_ERR_ADDR_MAX ((volatile uint32_t*) (CV_VP_OBI_SLV_RESP_BASE + 4*1)) +#define 
CV_VP_OBI_SLV_RESP_I_ERR_VALID ((volatile uint32_t*) (CV_VP_OBI_SLV_RESP_BASE + 4*2)) +#define CV_VP_OBI_SLV_RESP_I_EXOKAY_ADDR_MIN ((volatile uint32_t*) (CV_VP_OBI_SLV_RESP_BASE + 4*3)) +#define CV_VP_OBI_SLV_RESP_I_EXOKAY_ADDR_MAX ((volatile uint32_t*) (CV_VP_OBI_SLV_RESP_BASE + 4*4)) +#define CV_VP_OBI_SLV_RESP_I_EXOKAY_VALID ((volatile uint32_t*) (CV_VP_OBI_SLV_RESP_BASE + 4*5)) + +#define CV_VP_OBI_SLV_RESP_D_ERR_ADDR_MIN ((volatile uint32_t*) (CV_VP_OBI_SLV_RESP_BASE + 6*4 + 4*0)) +#define CV_VP_OBI_SLV_RESP_D_ERR_ADDR_MAX ((volatile uint32_t*) (CV_VP_OBI_SLV_RESP_BASE + 6*4 + 4*1)) +#define CV_VP_OBI_SLV_RESP_D_ERR_VALID ((volatile uint32_t*) (CV_VP_OBI_SLV_RESP_BASE + 6*4 + 4*2)) +#define CV_VP_OBI_SLV_RESP_D_EXOKAY_ADDR_MIN ((volatile uint32_t*) (CV_VP_OBI_SLV_RESP_BASE + 6*4 + 4*3)) +#define CV_VP_OBI_SLV_RESP_D_EXOKAY_ADDR_MAX ((volatile uint32_t*) (CV_VP_OBI_SLV_RESP_BASE + 6*4 + 4*4)) +#define CV_VP_OBI_SLV_RESP_D_EXOKAY_VALID ((volatile uint32_t*) (CV_VP_OBI_SLV_RESP_BASE + 6*4 + 4*5)) + +// API for Debug Control VP register +#define CV_VP_DEBUG_CONTROL_DBG_REQ(i) ((i) << 31) +#define CV_VP_DEBUG_CONTROL_REQ_MODE(i) ((i) << 30) +#define CV_VP_DEBUG_CONTROL_RAND_PULSE_DURATION(i) ((i) << 29) +#define CV_VP_DEBUG_CONTROL_PULSE_DURATION(i) ((i) << 16) +#define CV_VP_DEBUG_CONTROL_RAND_START_DELAY(i) ((i) << 15) +#define CV_VP_DEBUG_CONTROL_START_DELAY(i) ((i) << 0) +#define CV_VP_DEBUG_CONTROL *((volatile uint32_t * volatile) (CV_VP_DEBUG_CONTROL_BASE)) + +#endif diff --git a/verif/tests/custom/debug_test/bsp/crt0.S b/verif/tests/custom/debug_test/bsp/crt0.S new file mode 100644 index 0000000000..e957d78e72 --- /dev/null +++ b/verif/tests/custom/debug_test/bsp/crt0.S @@ -0,0 +1,82 @@ +/* Copyright (c) 2017 SiFive Inc. All rights reserved. + * Copyright (c) 2019 ETH Zürich and University of Bologna + * Copyright (c) 2023 Silicon Laboratories Inc. 
+ * This copyrighted material is made available to anyone wishing to use, + * modify, copy, or redistribute it subject to the terms and conditions + * of the FreeBSD License. This program is distributed in the hope that + * it will be useful, but WITHOUT ANY WARRANTY expressed or implied, + * including the implied warranties of MERCHANTABILITY or FITNESS FOR + * A PARTICULAR PURPOSE. A copy of this license is available at + * http://www.opensource.org/licenses. + */ +/* Make sure the vector table gets linked into the binary. */ +.global vector_table + +/* Make sure the NMI handler gets linked into the binary. */ +.global nmi_handler + +/* Entry point for bare metal programs */ +.section .text.start +.global _start +.type _start, @function + +_start: +/* initialize global pointer */ +.option push +.option norelax +1:auipc gp, %pcrel_hi(__global_pointer$) + addi gp, gp, %pcrel_lo(1b) + +// /* initialize vector table pointer */ +// 1:auipc a0, %pcrel_hi(__jvt_base$) +// addi a0, a0, %pcrel_lo(1b) +// csrw jvt, a0 +.option pop + +/* initialize stack pointer */ + la sp, __stack_start + +/* set vector table address */ + la a0, __vector_start + ori a0, a0, 1 /*vector mode = vectored */ + csrw mtvec, a0 + +/* clear the bss segment */ + la a0, _edata + la a2, _end + sub a2, a2, a0 + li a1, 0 + call memset + +/* new-style constructors and destructors */ + la a0, __libc_fini_array + call atexit + call __libc_init_array + +/* call main */ +// lw a0, 0(sp) /* a0 = argc */ +// addi a1, sp, __SIZEOF_POINTER__ /* a1 = argv */ +// li a2, 0 /* a2 = envp = NULL */ +// Initialize these variables to 0. Cannot use argc or argv +// since the stack is not initialized + li a0, 0 + li a1, 0 + li a2, 0 + + call main + tail exit + +.size _start, .-_start + + +.global _init +.type _init, @function +.global _fini +.type _fini, @function +_init: +_fini: + /* These don't have to do anything since we use init_array/fini_array. 
Prevent + missing symbol error */ + ret +.size _init, .-_init +.size _fini, .-_fini diff --git a/verif/tests/custom/debug_test/bsp/handlers.S b/verif/tests/custom/debug_test/bsp/handlers.S new file mode 100644 index 0000000000..7509a98aa2 --- /dev/null +++ b/verif/tests/custom/debug_test/bsp/handlers.S @@ -0,0 +1,345 @@ +/* +* Copyright 2019 ETH Zürich and University of Bologna +* +* Licensed under the Apache License, Version 2.0 (the "License"); +* you may not use this file except in compliance with the License. +* You may obtain a copy of the License at +* +* http://www.apache.org/licenses/LICENSE-2.0 +* +* Unless required by applicable law or agreed to in writing, software +* distributed under the License is distributed on an "AS IS" BASIS, +* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +* See the License for the specific language governing permissions and +* limitations under the License. +*/ + +/* Exception codes */ +#define EXCEPTION_INSN_ACCESS_FAULT 1 +#define EXCEPTION_ILLEGAL_INSN 2 +#define EXCEPTION_BREAKPOINT 3 +#define EXCEPTION_LOAD_ACCESS_FAULT 5 +#define EXCEPTION_STORE_ACCESS_FAULT 7 +#define EXCEPTION_ECALL_M 11 +#define EXCEPTION_ECALL_U 8 +#define EXCEPTION_INSN_BUS_FAULT 24 + +/* NMI interrupt codes */ +#define INTERRUPT_LOAD_BUS_FAULT (1024 | (0x1 << 31)) +#define INTERRUPT_STORE_BUS_FAULT (1025 | (0x1 << 31)) + +.section .text.handlers +.global __no_irq_handler +.global u_sw_irq_handler +.global m_software_irq_handler +.global m_timer_irq_handler +.global m_external_irq_handler +.global m_fast0_irq_handler +.global m_fast1_irq_handler +.global m_fast2_irq_handler +.global m_fast3_irq_handler +.global m_fast4_irq_handler +.global m_fast5_irq_handler +.global m_fast6_irq_handler +.global m_fast7_irq_handler +.global m_fast8_irq_handler +.global m_fast9_irq_handler +.global m_fast10_irq_handler +.global m_fast11_irq_handler +.global m_fast12_irq_handler +.global m_fast13_irq_handler +.global m_fast14_irq_handler 
+.global m_fast15_irq_handler +.global end_handler_incr_mepc +.global end_handler_ret + +.weak __no_irq_handler +.weak u_sw_irq_handler +.weak m_software_irq_handler +.weak m_timer_irq_handler +.weak m_external_irq_handler +.weak m_fast0_irq_handler +.weak m_fast1_irq_handler +.weak m_fast2_irq_handler +.weak m_fast3_irq_handler +.weak m_fast4_irq_handler +.weak m_fast5_irq_handler +.weak m_fast6_irq_handler +.weak m_fast7_irq_handler +.weak m_fast8_irq_handler +.weak m_fast9_irq_handler +.weak m_fast10_irq_handler +.weak m_fast11_irq_handler +.weak m_fast12_irq_handler +.weak m_fast13_irq_handler +.weak m_fast14_irq_handler +.weak m_fast15_irq_handler + +.weak handle_illegal_insn +.weak handle_insn_access_fault +.weak handle_insn_bus_fault +.weak handle_ecall +.weak handle_ecall_u + +/* exception handling */ +__no_irq_handler: + la a0, no_exception_handler_msg + jal ra, puts + j __no_irq_handler + +m_software_irq_handler: + j __no_irq_handler + +m_timer_irq_handler: + j __no_irq_handler + +m_external_irq_handler: + j __no_irq_handler + +m_fast0_irq_handler: + j __no_irq_handler + +m_fast1_irq_handler: + j __no_irq_handler + +m_fast2_irq_handler: + j __no_irq_handler + +m_fast3_irq_handler: + j __no_irq_handler + +m_fast4_irq_handler: + j __no_irq_handler + +m_fast5_irq_handler: + j __no_irq_handler + +m_fast6_irq_handler: + j __no_irq_handler + +m_fast7_irq_handler: + j __no_irq_handler + +m_fast8_irq_handler: + j __no_irq_handler + +m_fast9_irq_handler: + j __no_irq_handler + +m_fast10_irq_handler: + j __no_irq_handler + +m_fast11_irq_handler: + j __no_irq_handler + +m_fast12_irq_handler: + j __no_irq_handler + +m_fast13_irq_handler: + j __no_irq_handler + +m_fast14_irq_handler: + j __no_irq_handler + +m_fast15_irq_handler: + j __no_irq_handler + +u_sw_irq_handler: + /* While we are still using puts in handlers, save all caller saved + regs. Eventually, some of these saves could be deferred. 
*/ + addi sp,sp,-64 + sw ra, 0(sp) + sw a0, 4(sp) + sw a1, 8(sp) + sw a2, 12(sp) + sw a3, 16(sp) + sw a4, 20(sp) + sw a5, 24(sp) + sw a6, 28(sp) + sw a7, 32(sp) + sw t0, 36(sp) + sw t1, 40(sp) + sw t2, 44(sp) + sw t3, 48(sp) + sw t4, 52(sp) + sw t5, 56(sp) + sw t6, 60(sp) + csrr t0, mtvec + # Check for clic + andi t0, t0, 0x3 + addi t1, zero, 0x3 + # non-clic jump + bne t0, t1, 1f + # clic section (Filter out upper bits, mpp etc.) + csrr t0, mcause + lui t1, 0x1 + addi t1, t1, -1 + and t0, t1, t0 + j 2f + + 1: csrr t0, mcause + 2: li t1, EXCEPTION_INSN_ACCESS_FAULT + beq t0, t1, handle_insn_access_fault + li t1, EXCEPTION_ILLEGAL_INSN + beq t0, t1, handle_illegal_insn + li t1, EXCEPTION_ECALL_M + beq t0, t1, handle_ecall + li t1, EXCEPTION_ECALL_U + beq t0, t1, handle_ecall_u + li t1, EXCEPTION_BREAKPOINT + beq t0, t1, handle_ebreak + li t1, EXCEPTION_INSN_BUS_FAULT + beq t0, t1, handle_insn_bus_fault + j handle_unknown + +handle_ecall: + jal ra, handle_syscall + j end_handler_incr_mepc + +handle_ecall_u: + jal ra, handle_syscall + j end_handler_incr_mepc + +handle_ebreak: + /* TODO support debug handling requirements. */ + la a0, ebreak_msg + jal ra, puts + j end_handler_incr_mepc + +handle_illegal_insn: + la a0, illegal_insn_msg + jal ra, puts + j end_handler_incr_mepc + +handle_insn_access_fault: + la a0, insn_access_fault_msg + jal ra, puts + j end_handler_incr_mepc + +handle_insn_bus_fault: + la a0, insn_bus_fault_msg + jal ra, puts + /* Do not advance the mepc, tests should handle this appropriately */ + j end_handler_ret + +handle_unknown: + la a0, unknown_msg + jal ra, puts + /* We don't know what interrupt/exception is being handled, so don't + increment mepc. */ + j end_handler_ret + +end_handler_incr_mepc: + csrr t0, mepc + lb t1, 0(t0) + li a0, 0x3 + and t1, t1, a0 + /* Increment mepc by 2 or 4 depending on whether the instruction at mepc + is compressed or not.
*/ + bne t1, a0, end_handler_incr_mepc2 + addi t0, t0, 2 +end_handler_incr_mepc2: + addi t0, t0, 2 + csrw mepc, t0 +end_handler_ret: + lw ra, 0(sp) + lw a0, 4(sp) + lw a1, 8(sp) + lw a2, 12(sp) + lw a3, 16(sp) + lw a4, 20(sp) + lw a5, 24(sp) + lw a6, 28(sp) + lw a7, 32(sp) + lw t0, 36(sp) + lw t1, 40(sp) + lw t2, 44(sp) + lw t3, 48(sp) + lw t4, 52(sp) + lw t5, 56(sp) + lw t6, 60(sp) + addi sp,sp,64 + mret + +.weak handle_data_load_bus_fault +.weak handle_data_store_bus_fault + +.section .nmi, "ax" +.global nmi_handler +.global nmi_end_handler_ret + +nmi_handler: + addi sp,sp,-64 + sw ra, 0(sp) + sw a0, 4(sp) + sw a1, 8(sp) + sw a2, 12(sp) + sw a3, 16(sp) + sw a4, 20(sp) + sw a5, 24(sp) + sw a6, 28(sp) + sw a7, 32(sp) + sw t0, 36(sp) + sw t1, 40(sp) + sw t2, 44(sp) + sw t3, 48(sp) + sw t4, 52(sp) + sw t5, 56(sp) + sw t6, 60(sp) + csrr t0, mcause + li t1, INTERRUPT_LOAD_BUS_FAULT + beq t0, t1, handle_data_load_bus_fault + li t1, INTERRUPT_STORE_BUS_FAULT + beq t0, t1, handle_data_store_bus_fault + + j nmi_end_handler_ret + +handle_data_load_bus_fault: + la a0, data_load_bus_fault_msg + jal ra, puts + j nmi_end_handler_ret + +handle_data_store_bus_fault: + la a0, data_store_bus_fault_msg + jal ra, puts + j nmi_end_handler_ret + +nmi_end_handler_ret: + lw ra, 0(sp) + lw a0, 4(sp) + lw a1, 8(sp) + lw a2, 12(sp) + lw a3, 16(sp) + lw a4, 20(sp) + lw a5, 24(sp) + lw a6, 28(sp) + lw a7, 32(sp) + lw t0, 36(sp) + lw t1, 40(sp) + lw t2, 44(sp) + lw t3, 48(sp) + lw t4, 52(sp) + lw t5, 56(sp) + lw t6, 60(sp) + addi sp,sp,64 + mret + +.section .rodata +data_load_bus_fault_msg: + .string "CVA6 BSP: data load bus fault exception handler entered\n" +data_store_bus_fault_msg: + .string "CVA6 BSP: data store bus fault exception handler entered\n" +insn_access_fault_msg: + .string "CVA6 BSP: instruction access fault exception handler entered\n" +insn_bus_fault_msg: + .string "CVA6 BSP: instruction bus fault exception handler entered\n" +illegal_insn_msg: + .string "CVA6 BSP: illegal 
instruction exception handler entered\n" +ecall_msg: + .string "CVA6 BSP: ecall exception handler entered\n" +ebreak_msg: + .string "CVA6 BSP: ebreak exception handler entered\n" +unknown_msg: + .string "CVA6 BSP: unknown exception handler entered\n" +no_exception_handler_msg: + .string "CVA6 BSP: no exception handler installed\n" diff --git a/verif/tests/custom/debug_test/bsp/link.ld b/verif/tests/custom/debug_test/bsp/link.ld new file mode 100644 index 0000000000..9d7181f3be --- /dev/null +++ b/verif/tests/custom/debug_test/bsp/link.ld @@ -0,0 +1,321 @@ +/* Script for -z combreloc */ +/* Copyright (C) 2014-2020 Free Software Foundation, Inc. + Copyright (C) 2019 ETH Zürich and University of Bologna + Copyright (C) 2020 OpenHW Group + Copying and distribution of this script, with or without modification, + are permitted in any medium without royalty provided the copyright + notice and this notice are preserved. */ + +/* This linker script is adapted from the default linker script for upstream + RISC-V GCC. It has been modified for use in verification of CORE-V cores. +*/ + +OUTPUT_FORMAT("elf32-littleriscv", "elf32-littleriscv", + "elf32-littleriscv") +OUTPUT_ARCH(riscv) +ENTRY(_start) + +/* CORE-V */ +MEMORY +{ + /* Our testbench is a bit weird in that we initialize the RAM (thus + allowing initialized sections to be placed there). In fact we dump all + sections to ram.
*/ + + ram (rwxai) : ORIGIN = 0x80000000, LENGTH = 0x400000 + dbg (rwxai) : ORIGIN = 0x800, LENGTH = 0x1000 +} + +SECTIONS +{ + /* CORE-V Debugger Code: This section address must be the same as the + DM_HaltAddress parameter in the RTL */ + .debugger (ORIGIN(dbg)): + { + PROVIDE(_debugger_start = .); + KEEP(*(.debugger)); + } >dbg + .debugger_exception (0x1000): + { + PROVIDE(_debugger_exception = .); + KEEP(*(.debugger_exception)); + } >dbg + /* Debugger Stack*/ + .debugger_stack : ALIGN(16) + { + PROVIDE(__debugger_stack_start = ALIGN(ORIGIN(dbg) + LENGTH(dbg) - 15, 16)); + } >dbg + + + /* CORE-V: crt0 init code */ + .init (ORIGIN(ram)): + { + KEEP (*(SORT_NONE(.init))) + KEEP (*(.text.start)) + } >ram + + . = ALIGN(0x1000); + .tohost : { *(.tohost) } + + /* CORE-V: interrupt vectors */ + . = ALIGN(0x1000); + .vectors : + { + . = ALIGN(0x1000); + PROVIDE(__vector_start = .); + KEEP(*(.vectors)); + } >ram + + + /* Read-only sections, merged into text segment: */ + PROVIDE (__executable_start = SEGMENT_START("text-segment", 0x10000)); . 
= SEGMENT_START("text-segment", 0x10000) + SIZEOF_HEADERS; + .interp : { *(.interp) } >ram + .note.gnu.build-id : { *(.note.gnu.build-id) } >ram + .hash : { *(.hash) } >ram + .gnu.hash : { *(.gnu.hash) } >ram + .dynsym : { *(.dynsym) } >ram + .dynstr : { *(.dynstr) } >ram + .gnu.version : { *(.gnu.version) } >ram + .gnu.version_d : { *(.gnu.version_d) } >ram + .gnu.version_r : { *(.gnu.version_r) } >ram + .rela.dyn : + { + *(.rela.init) + *(.rela.text .rela.text.* .rela.gnu.linkonce.t.*) + *(.rela.fini) + *(.rela.rodata .rela.rodata.* .rela.gnu.linkonce.r.*) + *(.rela.data .rela.data.* .rela.gnu.linkonce.d.*) + *(.rela.tdata .rela.tdata.* .rela.gnu.linkonce.td.*) + *(.rela.tbss .rela.tbss.* .rela.gnu.linkonce.tb.*) + *(.rela.ctors) + *(.rela.dtors) + *(.rela.got) + *(.rela.sdata .rela.sdata.* .rela.gnu.linkonce.s.*) + *(.rela.sbss .rela.sbss.* .rela.gnu.linkonce.sb.*) + *(.rela.sdata2 .rela.sdata2.* .rela.gnu.linkonce.s2.*) + *(.rela.sbss2 .rela.sbss2.* .rela.gnu.linkonce.sb2.*) + *(.rela.bss .rela.bss.* .rela.gnu.linkonce.b.*) + PROVIDE_HIDDEN (__rela_iplt_start = .); + *(.rela.iplt) + PROVIDE_HIDDEN (__rela_iplt_end = .); + } >ram + .rela.plt : + { + *(.rela.plt) + } >ram + + .plt : { *(.plt) } + .iplt : { *(.iplt) } + .text : + { + /* FIXME: the naming for text.tbljal will most likely change and move out of .text */ + . = ALIGN(1024); + *(.text.tbljal) + *(.text.unlikely .text.*_unlikely .text.unlikely.*) + *(.text.exit .text.exit.*) + *(.text.startup .text.startup.*) + *(.text.hot .text.hot.*) + *(SORT(.text.sorted.*)) + *(.text .stub .text.* .gnu.linkonce.t.*) + /* .gnu.warning sections are handled specially by elf.em. 
*/ + *(.gnu.warning) + } >ram + .fini : + { + KEEP (*(SORT_NONE(.fini))) + } >ram + PROVIDE (__etext = .); + PROVIDE (_etext = .); + PROVIDE (etext = .); + .rodata : { *(.rodata .rodata.* .gnu.linkonce.r.*) } >ram + .rodata1 : { *(.rodata1) } >ram + .sdata2 : + { + *(.sdata2 .sdata2.* .gnu.linkonce.s2.*) + } >ram + .sbss2 : { *(.sbss2 .sbss2.* .gnu.linkonce.sb2.*) } >ram + .eh_frame_hdr : { *(.eh_frame_hdr) *(.eh_frame_entry .eh_frame_entry.*) } >ram + .eh_frame : ONLY_IF_RO { KEEP (*(.eh_frame)) *(.eh_frame.*) } >ram + .gcc_except_table : ONLY_IF_RO { *(.gcc_except_table .gcc_except_table.*) } >ram + .gnu_extab : ONLY_IF_RO { *(.gnu_extab*) } >ram + /* These sections are generated by the Sun/Oracle C++ compiler. */ + .exception_ranges : ONLY_IF_RO { *(.exception_ranges*) } + /* Adjust the address for the data segment. We want to adjust up to + the same address within the page on the next page up. */ + . = DATA_SEGMENT_ALIGN (CONSTANT (MAXPAGESIZE), CONSTANT (COMMONPAGESIZE)); + /* Exception handling */ + .eh_frame : ONLY_IF_RW { KEEP (*(.eh_frame)) *(.eh_frame.*) } >ram + .gnu_extab : ONLY_IF_RW { *(.gnu_extab) } >ram + .gcc_except_table : ONLY_IF_RW { *(.gcc_except_table .gcc_except_table.*) } >ram + .exception_ranges : ONLY_IF_RW { *(.exception_ranges*) } >ram + /* Thread Local Storage sections */ + .tdata : + { + PROVIDE_HIDDEN (__tdata_start = .); + *(.tdata .tdata.* .gnu.linkonce.td.*) + } >ram + .tbss : { *(.tbss .tbss.* .gnu.linkonce.tb.*) *(.tcommon) } >ram + .preinit_array : + { + PROVIDE_HIDDEN (__preinit_array_start = .); + KEEP (*(.preinit_array)) + PROVIDE_HIDDEN (__preinit_array_end = .); + } >ram + .init_array : + { + PROVIDE_HIDDEN (__init_array_start = .); + KEEP (*(SORT_BY_INIT_PRIORITY(.init_array.*) SORT_BY_INIT_PRIORITY(.ctors.*))) + KEEP (*(.init_array EXCLUDE_FILE (*crtbegin.o *crtbegin?.o *crtend.o *crtend?.o ) .ctors)) + PROVIDE_HIDDEN (__init_array_end = .); + } >ram + .fini_array : + { + PROVIDE_HIDDEN (__fini_array_start = .); + KEEP 
(*(SORT_BY_INIT_PRIORITY(.fini_array.*) SORT_BY_INIT_PRIORITY(.dtors.*))) + KEEP (*(.fini_array EXCLUDE_FILE (*crtbegin.o *crtbegin?.o *crtend.o *crtend?.o ) .dtors)) + PROVIDE_HIDDEN (__fini_array_end = .); + } >ram + .ctors : + { + /* gcc uses crtbegin.o to find the start of + the constructors, so we make sure it is + first. Because this is a wildcard, it + doesn't matter if the user does not + actually link against crtbegin.o; the + linker won't look for a file to match a + wildcard. The wildcard also means that it + doesn't matter which directory crtbegin.o + is in. */ + KEEP (*crtbegin.o(.ctors)) + KEEP (*crtbegin?.o(.ctors)) + /* We don't want to include the .ctor section from + the crtend.o file until after the sorted ctors. + The .ctor section from the crtend file contains the + end of ctors marker and it must be last */ + KEEP (*(EXCLUDE_FILE (*crtend.o *crtend?.o ) .ctors)) + KEEP (*(SORT(.ctors.*))) + KEEP (*(.ctors)) + } >ram + .dtors : + { + KEEP (*crtbegin.o(.dtors)) + KEEP (*crtbegin?.o(.dtors)) + KEEP (*(EXCLUDE_FILE (*crtend.o *crtend?.o ) .dtors)) + KEEP (*(SORT(.dtors.*))) + KEEP (*(.dtors)) + } >ram + .jcr : { KEEP (*(.jcr)) } + .data.rel.ro : { *(.data.rel.ro.local* .gnu.linkonce.d.rel.ro.local.*) *(.data.rel.ro .data.rel.ro.* .gnu.linkonce.d.rel.ro.*) } + .dynamic : { *(.dynamic) } + . = DATA_SEGMENT_RELRO_END (0, .); + .data : + { + __DATA_BEGIN__ = .; + *(.data .data.* .gnu.linkonce.d.*) + SORT(CONSTRUCTORS) + } >ram + .data1 : { *(.data1) } >ram + .got : { *(.got.plt) *(.igot.plt) *(.got) *(.igot) } + /* We want the small data sections together, so single-instruction offsets + can access them all, and initialized data all before uninitialized, so + we can shorten the on-disk segment size. */ + .sdata : + { + __SDATA_BEGIN__ = .; + *(.srodata.cst16) *(.srodata.cst8) *(.srodata.cst4) *(.srodata.cst2) *(.srodata .srodata.*) + *(.sdata .sdata.* .gnu.linkonce.s.*) + } >ram + _edata = .; PROVIDE (edata = .); + . 
= .; + __bss_start = .; + .sbss : + { + *(.dynsbss) + *(.sbss .sbss.* .gnu.linkonce.sb.*) + *(.scommon) + } >ram + .bss : + { + *(.dynbss) + *(.bss .bss.* .gnu.linkonce.b.*) + *(COMMON) + /* Align here to ensure that the .bss section occupies space up to + _end. Align after .bss to ensure correct alignment even if the + .bss section disappears because there are no input sections. + FIXME: Why do we need it? When there is no .bss section, we do not + pad the .data section. */ + . = ALIGN(. != 0 ? 32 / 8 : 1); + } >ram + . = ALIGN(32 / 8); + . = SEGMENT_START("ldata-segment", .); + . = ALIGN(32 / 8); + __bss_end = .; + __global_pointer$ = MIN(__SDATA_BEGIN__ + 0x800, + MAX(__DATA_BEGIN__ + 0x800, __bss_end - 0x800)); + _end = .; PROVIDE (end = .); + . = DATA_SEGMENT_END (.); + + /* Heap grows upward towards end of ram */ + .heap : ALIGN(16) + { + PROVIDE(__heap_start = .); + /* If end of ram is not 16-byte aligned, align to previous 16-byte + boundary */ + PROVIDE(__heap_end = ALIGN(ORIGIN(ram) + LENGTH(ram) - 15, 16)); + . = __heap_end; + } >ram + + /* Stack grows downward from end of ram */ + .stack (__heap_end) : ALIGN(16) /* this is a requirement of the ABI(?) */ + { + PROVIDE(__stack_start = __heap_end); + . = __stack_start; + } >ram + + + + + /* Stabs debugging sections. */ + .stab 0 : { *(.stab) } + .stabstr 0 : { *(.stabstr) } + .stab.excl 0 : { *(.stab.excl) } + .stab.exclstr 0 : { *(.stab.exclstr) } + .stab.index 0 : { *(.stab.index) } + .stab.indexstr 0 : { *(.stab.indexstr) } + .comment 0 : { *(.comment) } + .gnu.build.attributes : { *(.gnu.build.attributes .gnu.build.attributes.*) } + /* DWARF debug sections. + Symbols in the DWARF debugging sections are relative to the beginning + of the section so we begin them at 0. 
*/ + /* DWARF 1 */ + .debug 0 : { *(.debug) } + .line 0 : { *(.line) } + /* GNU DWARF 1 extensions */ + .debug_srcinfo 0 : { *(.debug_srcinfo) } + .debug_sfnames 0 : { *(.debug_sfnames) } + /* DWARF 1.1 and DWARF 2 */ + .debug_aranges 0 : { *(.debug_aranges) } + .debug_pubnames 0 : { *(.debug_pubnames) } + /* DWARF 2 */ + .debug_info 0 : { *(.debug_info .gnu.linkonce.wi.*) } + .debug_abbrev 0 : { *(.debug_abbrev) } + .debug_line 0 : { *(.debug_line .debug_line.* .debug_line_end) } + .debug_frame 0 : { *(.debug_frame) } + .debug_str 0 : { *(.debug_str) } + .debug_loc 0 : { *(.debug_loc) } + .debug_macinfo 0 : { *(.debug_macinfo) } + /* SGI/MIPS DWARF 2 extensions */ + .debug_weaknames 0 : { *(.debug_weaknames) } + .debug_funcnames 0 : { *(.debug_funcnames) } + .debug_typenames 0 : { *(.debug_typenames) } + .debug_varnames 0 : { *(.debug_varnames) } + /* DWARF 3 */ + .debug_pubtypes 0 : { *(.debug_pubtypes) } + .debug_ranges 0 : { *(.debug_ranges) } + /* DWARF Extension. */ + .debug_macro 0 : { *(.debug_macro) } + .debug_addr 0 : { *(.debug_addr) } + .gnu.attributes 0 : { KEEP (*(.gnu.attributes)) } + /DISCARD/ : { *(.note.GNU-stack) *(.gnu_debuglink) *(.gnu.lto_*) } +} + diff --git a/verif/tests/custom/debug_test/bsp/link_corev-dv.ld b/verif/tests/custom/debug_test/bsp/link_corev-dv.ld new file mode 100644 index 0000000000..e581f98659 --- /dev/null +++ b/verif/tests/custom/debug_test/bsp/link_corev-dv.ld @@ -0,0 +1,288 @@ +/* Copyright (C) 2014-2020 Free Software Foundation, Inc. + Copyright (C) 2019 ETH Zürich and University of Bologna + Copyright (C) 2020 OpenHW Group + Copying and distribution of this script, with or without modification, + are permitted in any medium without royalty provided the copyright + notice and this notice are preserved. */ + +/* This linker script is adapted from the default linker script for upstream + RISC-V GCC. It has been modified for use in verification of CORE-V cores. 
+*/ + +OUTPUT_FORMAT("elf32-littleriscv", "elf32-littleriscv", + "elf32-littleriscv") +OUTPUT_ARCH(riscv) +ENTRY(_start) + +/* + * The only changes from the default linker file are: + * - MEMORY section will be generated into its own file, linkcmds.memory + * - debug part of SECTIONS removed and generated (if desired) into its own linkcmds.dbgsec + * - New SECTIONS defined for generated pma-adapted memory regions + */ + +/* Memory layout */ +INCLUDE linkcmds.memory +/* Debug section */ +INCLUDE linkcmds.dbgsec +/* PMA sections */ +INCLUDE linkcmds.pmasec +/* Fixed address sections - Must be dynamically relocatable by generator script */ +INCLUDE linkcmds.fixadd + +SECTIONS +{ + + /* CORE-V: crt0 init code */ + .init (__boot_address): + { + KEEP (*(SORT_NONE(.init))) + KEEP (*(.text.start)) + } >rom + + /* Read-only sections, merged into text segment: */ + PROVIDE (__executable_start = SEGMENT_START("text-segment", 0x10000)); . = SEGMENT_START("text-segment", 0x10000) + SIZEOF_HEADERS; + .interp : { *(.interp) } >ram + .note.gnu.build-id : { *(.note.gnu.build-id) } >ram + .hash : { *(.hash) } >ram + .gnu.hash : { *(.gnu.hash) } >ram + .dynsym : { *(.dynsym) } >ram + .dynstr : { *(.dynstr) } >ram + .gnu.version : { *(.gnu.version) } >ram + .gnu.version_d : { *(.gnu.version_d) } >ram + .gnu.version_r : { *(.gnu.version_r) } >ram + .rela.dyn : + { + *(.rela.init) + *(.rela.text .rela.text.* .rela.gnu.linkonce.t.*) + *(.rela.fini) + *(.rela.rodata .rela.rodata.* .rela.gnu.linkonce.r.*) + *(.rela.data .rela.data.* .rela.gnu.linkonce.d.*) + *(.rela.tdata .rela.tdata.* .rela.gnu.linkonce.td.*) + *(.rela.tbss .rela.tbss.* .rela.gnu.linkonce.tb.*) + *(.rela.ctors) + *(.rela.dtors) + *(.rela.got) + *(.rela.sdata .rela.sdata.* .rela.gnu.linkonce.s.*) + *(.rela.sbss .rela.sbss.* .rela.gnu.linkonce.sb.*) + *(.rela.sdata2 .rela.sdata2.* .rela.gnu.linkonce.s2.*) + *(.rela.sbss2 .rela.sbss2.* .rela.gnu.linkonce.sb2.*) + *(.rela.bss .rela.bss.* .rela.gnu.linkonce.b.*) + 
PROVIDE_HIDDEN (__rela_iplt_start = .); + *(.rela.iplt) + PROVIDE_HIDDEN (__rela_iplt_end = .); + } >rom + .rela.plt : + { + *(.rela.plt) + } >rom + + .plt : { *(.plt) } + .iplt : { *(.iplt) } + .text : + { + *(.text.unlikely .text.*_unlikely .text.unlikely.*) + *(.text.exit .text.exit.*) + *(.text.startup .text.startup.*) + *(.text.hot .text.hot.*) + *(SORT(.text.sorted.*)) + *(.text .stub .text.* .gnu.linkonce.t.*) + /* .gnu.warning sections are handled specially by elf.em. */ + *(.gnu.warning) + } >rom + .fini : + { + KEEP (*(SORT_NONE(.fini))) + } >rom + PROVIDE (__etext = .); + PROVIDE (_etext = .); + PROVIDE (etext = .); + .rodata : { *(.rodata .rodata.* .gnu.linkonce.r.*) } >ram + .rodata1 : { *(.rodata1) } >ram + .sdata2 : + { + *(.sdata2 .sdata2.* .gnu.linkonce.s2.*) + } >rom + .sbss2 : { *(.sbss2 .sbss2.* .gnu.linkonce.sb2.*) } >ram + .eh_frame_hdr : { *(.eh_frame_hdr) *(.eh_frame_entry .eh_frame_entry.*) } >ram + .eh_frame : ONLY_IF_RO { KEEP (*(.eh_frame)) *(.eh_frame.*) } >ram + .gcc_except_table : ONLY_IF_RO { *(.gcc_except_table .gcc_except_table.*) } >ram + .gnu_extab : ONLY_IF_RO { *(.gnu_extab*) } >ram + /* These sections are generated by the Sun/Oracle C++ compiler. */ + .exception_ranges : ONLY_IF_RO { *(.exception_ranges*) } + /* Adjust the address for the data segment. We want to adjust up to + the same address within the page on the next page up. */ + . 
= DATA_SEGMENT_ALIGN (CONSTANT (MAXPAGESIZE), CONSTANT (COMMONPAGESIZE)); + /* Exception handling */ + .eh_frame : ONLY_IF_RW { KEEP (*(.eh_frame)) *(.eh_frame.*) } >ram + .gnu_extab : ONLY_IF_RW { *(.gnu_extab) } >ram + .gcc_except_table : ONLY_IF_RW { *(.gcc_except_table .gcc_except_table.*) } >ram + .exception_ranges : ONLY_IF_RW { *(.exception_ranges*) } >ram + /* Thread Local Storage sections */ + .tdata : + { + PROVIDE_HIDDEN (__tdata_start = .); + *(.tdata .tdata.* .gnu.linkonce.td.*) + } >ram + .tbss : { *(.tbss .tbss.* .gnu.linkonce.tb.*) *(.tcommon) } >ram + .preinit_array : + { + PROVIDE_HIDDEN (__preinit_array_start = .); + KEEP (*(.preinit_array)) + PROVIDE_HIDDEN (__preinit_array_end = .); + } >ram + .init_array : + { + PROVIDE_HIDDEN (__init_array_start = .); + KEEP (*(SORT_BY_INIT_PRIORITY(.init_array.*) SORT_BY_INIT_PRIORITY(.ctors.*))) + KEEP (*(.init_array EXCLUDE_FILE (*crtbegin.o *crtbegin?.o *crtend.o *crtend?.o ) .ctors)) + PROVIDE_HIDDEN (__init_array_end = .); + } >ram + .fini_array : + { + PROVIDE_HIDDEN (__fini_array_start = .); + KEEP (*(SORT_BY_INIT_PRIORITY(.fini_array.*) SORT_BY_INIT_PRIORITY(.dtors.*))) + KEEP (*(.fini_array EXCLUDE_FILE (*crtbegin.o *crtbegin?.o *crtend.o *crtend?.o ) .dtors)) + PROVIDE_HIDDEN (__fini_array_end = .); + } >ram + .ctors : + { + /* gcc uses crtbegin.o to find the start of + the constructors, so we make sure it is + first. Because this is a wildcard, it + doesn't matter if the user does not + actually link against crtbegin.o; the + linker won't look for a file to match a + wildcard. The wildcard also means that it + doesn't matter which directory crtbegin.o + is in. */ + KEEP (*crtbegin.o(.ctors)) + KEEP (*crtbegin?.o(.ctors)) + /* We don't want to include the .ctor section from + the crtend.o file until after the sorted ctors. 
+ The .ctor section from the crtend file contains the + end of ctors marker and it must be last */ + KEEP (*(EXCLUDE_FILE (*crtend.o *crtend?.o ) .ctors)) + KEEP (*(SORT(.ctors.*))) + KEEP (*(.ctors)) + } >ram + .dtors : + { + KEEP (*crtbegin.o(.dtors)) + KEEP (*crtbegin?.o(.dtors)) + KEEP (*(EXCLUDE_FILE (*crtend.o *crtend?.o ) .dtors)) + KEEP (*(SORT(.dtors.*))) + KEEP (*(.dtors)) + } >ram + .jcr : { KEEP (*(.jcr)) } + .data.rel.ro : { *(.data.rel.ro.local* .gnu.linkonce.d.rel.ro.local.*) *(.data.rel.ro .data.rel.ro.* .gnu.linkonce.d.rel.ro.*) } + .dynamic : { *(.dynamic) } + . = DATA_SEGMENT_RELRO_END (0, .); + .data : + { + __DATA_BEGIN__ = .; + *(.data .data.* .gnu.linkonce.d.*) + SORT(CONSTRUCTORS) + } >ram + .data1 : { *(.data1) } >ram + .got : { *(.got.plt) *(.igot.plt) *(.got) *(.igot) } + /* We want the small data sections together, so single-instruction offsets + can access them all, and initialized data all before uninitialized, so + we can shorten the on-disk segment size. */ + .sdata : + { + __SDATA_BEGIN__ = .; + *(.srodata.cst16) *(.srodata.cst8) *(.srodata.cst4) *(.srodata.cst2) *(.srodata .srodata.*) + *(.sdata .sdata.* .gnu.linkonce.s.*) + } >ram + _edata = .; PROVIDE (edata = .); + . = .; + __bss_start = .; + .sbss : + { + *(.dynsbss) + *(.sbss .sbss.* .gnu.linkonce.sb.*) + *(.scommon) + } >ram + .bss : + { + *(.dynbss) + *(.bss .bss.* .gnu.linkonce.b.*) + *(COMMON) + /* Align here to ensure that the .bss section occupies space up to + _end. Align after .bss to ensure correct alignment even if the + .bss section disappears because there are no input sections. + FIXME: Why do we need it? When there is no .bss section, we do not + pad the .data section. */ + . = ALIGN(. != 0 ? 32 / 8 : 1); + } >ram + . = ALIGN(32 / 8); + . = SEGMENT_START("ldata-segment", .); + . = ALIGN(32 / 8); + __bss_end = .; + __global_pointer$ = MIN(__SDATA_BEGIN__ + 0x800, + MAX(__DATA_BEGIN__ + 0x800, __bss_end - 0x800)); + _end = .; PROVIDE (end = .); + . 
= DATA_SEGMENT_END (.); + + /* Heap grows upward towards end of ram */ + .heap : ALIGN(16) + { + PROVIDE(__heap_start = .); + /* If end of ram is not 16-byte aligned, align to previous 16-byte + boundary */ + PROVIDE(__heap_end = ALIGN(ORIGIN(ram) + LENGTH(ram) - 15, 16)); + . = __heap_end; + } >ram + + /* Stack grows downward from end of ram */ + .stack (__heap_end) : ALIGN(16) /* this is a requirement of the ABI(?) */ + { + PROVIDE(__stack_start = __heap_end); + . = __stack_start; + } >ram + + /* Stabs debugging sections. */ + .stab 0 : { *(.stab) } + .stabstr 0 : { *(.stabstr) } + .stab.excl 0 : { *(.stab.excl) } + .stab.exclstr 0 : { *(.stab.exclstr) } + .stab.index 0 : { *(.stab.index) } + .stab.indexstr 0 : { *(.stab.indexstr) } + .comment 0 : { *(.comment) } + .gnu.build.attributes : { *(.gnu.build.attributes .gnu.build.attributes.*) } + /* DWARF debug sections. + Symbols in the DWARF debugging sections are relative to the beginning + of the section so we begin them at 0. */ + /* DWARF 1 */ + .debug 0 : { *(.debug) } + .line 0 : { *(.line) } + /* GNU DWARF 1 extensions */ + .debug_srcinfo 0 : { *(.debug_srcinfo) } + .debug_sfnames 0 : { *(.debug_sfnames) } + /* DWARF 1.1 and DWARF 2 */ + .debug_aranges 0 : { *(.debug_aranges) } + .debug_pubnames 0 : { *(.debug_pubnames) } + /* DWARF 2 */ + .debug_info 0 : { *(.debug_info .gnu.linkonce.wi.*) } + .debug_abbrev 0 : { *(.debug_abbrev) } + .debug_line 0 : { *(.debug_line .debug_line.* .debug_line_end) } + .debug_frame 0 : { *(.debug_frame) } + .debug_str 0 : { *(.debug_str) } + .debug_loc 0 : { *(.debug_loc) } + .debug_macinfo 0 : { *(.debug_macinfo) } + /* SGI/MIPS DWARF 2 extensions */ + .debug_weaknames 0 : { *(.debug_weaknames) } + .debug_funcnames 0 : { *(.debug_funcnames) } + .debug_typenames 0 : { *(.debug_typenames) } + .debug_varnames 0 : { *(.debug_varnames) } + /* DWARF 3 */ + .debug_pubtypes 0 : { *(.debug_pubtypes) } + .debug_ranges 0 : { *(.debug_ranges) } + /* DWARF Extension. 
*/ + .debug_macro 0 : { *(.debug_macro) } + .debug_addr 0 : { *(.debug_addr) } + .gnu.attributes 0 : { KEEP (*(.gnu.attributes)) } + /DISCARD/ : { *(.note.GNU-stack) *(.gnu_debuglink) *(.gnu.lto_*) } +} + diff --git a/verif/tests/custom/debug_test/bsp/syscalls.c b/verif/tests/custom/debug_test/bsp/syscalls.c new file mode 100644 index 0000000000..f3833ba4fb --- /dev/null +++ b/verif/tests/custom/debug_test/bsp/syscalls.c @@ -0,0 +1,379 @@ +/* An extremely minimalist syscalls.c for newlib + * Based on riscv newlib libgloss/riscv/sys_*.c + * + * Copyright 2019 Claire Wolf + * Copyright 2019 ETH Zürich and University of Bologna + * + * Permission to use, copy, modify, and/or distribute this software for any + * purpose with or without fee is hereby granted, provided that the above + * copyright notice and this permission notice appear in all copies. + * + * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES WITH + * REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY + * AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY SPECIAL, DIRECT, + * INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES WHATSOEVER RESULTING FROM + * LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR + * OTHER TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR + * PERFORMANCE OF THIS SOFTWARE. + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include "corev_uvmt.h" +#undef errno +extern int errno; + +/* write to this reg for outputting strings */ +#define STDOUT_REG CV_VP_VIRTUAL_PRINTER_BASE +/* write test result of program to this reg */ +#define RESULT_REG (CV_VP_STATUS_FLAGS_BASE) +/* write exit value of program to this reg */ +#define EXIT_REG (CV_VP_STATUS_FLAGS_BASE + 4) + +#define STDOUT_FILENO 1 + +/* It turns out that older newlib versions use different symbol names which goes + * against newlib recommendations. 
Anyway this is fixed in later version. + */ +#if __NEWLIB__ <= 2 && __NEWLIB_MINOR__ <= 5 +#define _sbrk sbrk +#define _write write +#define _close close +#define _lseek lseek +#define _read read +#define _fstat fstat +#define _isatty isatty +#endif +/* Upstream newlib now defines this in libgloss/riscv/internal_syscall.h. */ +long +__syscall_error(long a0) +{ + errno = -a0; + return -1; +} + +void unimplemented_syscall() +{ + const char *p = "BSP: Unimplemented system call called!\n"; + while (*p) + *(volatile int *)STDOUT_REG = *(p++); +} + +int nanosleep(const struct timespec *rqtp, struct timespec *rmtp) +{ + errno = ENOSYS; + return -1; +} + +int _access(const char *file, int mode) +{ + errno = ENOSYS; + return -1; +} + +int _chdir(const char *path) +{ + errno = ENOSYS; + return -1; +} + +int _chmod(const char *path, mode_t mode) +{ + errno = ENOSYS; + return -1; +} + +int _chown(const char *path, uid_t owner, gid_t group) +{ + errno = ENOSYS; + return -1; +} + +int _close(int file) +{ + return -1; +} + +int _execve(const char *name, char *const argv[], char *const env[]) +{ + errno = ENOMEM; + return -1; +} + +void _exit(int exit_status) +{ + *(volatile int *)EXIT_REG = exit_status; + __asm__ volatile("wfi"); + /* _exit should not return */ + while (1) {}; +} + +int _faccessat(int dirfd, const char *file, int mode, int flags) +{ + errno = ENOSYS; + return -1; +} + +int _fork(void) +{ + errno = EAGAIN; + return -1; +} + +int _fstat(int file, struct stat *st) +{ + st->st_mode = S_IFCHR; + return 0; + // errno = -ENOSYS; + // return -1; +} + +int _fstatat(int dirfd, const char *file, struct stat *st, int flags) +{ + errno = ENOSYS; + return -1; +} + +int _ftime(struct timeb *tp) +{ + errno = ENOSYS; + return -1; +} + +char *_getcwd(char *buf, size_t size) +{ + errno = -ENOSYS; + return NULL; +} + +int _getpid() +{ + return 1; +} + +int _gettimeofday(struct timeval *tp, void *tzp) +{ + errno = -ENOSYS; + return -1; +} + +int _isatty(int file) +{ + return (file == 
STDOUT_FILENO); +} + +int _kill(int pid, int sig) +{ + errno = EINVAL; + return -1; +} + +int _link(const char *old_name, const char *new_name) +{ + errno = EMLINK; + return -1; +} + +off_t _lseek(int file, off_t ptr, int dir) +{ + return 0; +} + +int _lstat(const char *file, struct stat *st) +{ + errno = ENOSYS; + return -1; +} + +int _open(const char *name, int flags, int mode) +{ + return -1; +} + +int _openat(int dirfd, const char *name, int flags, int mode) +{ + errno = ENOSYS; + return -1; +} + +ssize_t _read(int file, void *ptr, size_t len) +{ + return 0; +} + +int _stat(const char *file, struct stat *st) +{ + st->st_mode = S_IFCHR; + return 0; + // errno = ENOSYS; + // return -1; +} + +long _sysconf(int name) +{ + + return -1; +} + +clock_t _times(struct tms *buf) +{ + return -1; +} + +int _unlink(const char *name) +{ + errno = ENOENT; + return -1; +} + +int _utime(const char *path, const struct utimbuf *times) +{ + errno = ENOSYS; + return -1; +} + +int _wait(int *status) +{ + errno = ECHILD; + return -1; +} + +ssize_t _write(int file, const void *ptr, size_t len) +{ + const char *cptr = (char *)ptr; + if (file != STDOUT_FILENO) + { + errno = ENOSYS; + return -1; + } + + const void *eptr = cptr + len; + while (cptr != eptr) + *(volatile int *)STDOUT_REG = *cptr++; + return len; +} + +extern char __heap_start[]; +extern char __heap_end[]; +static char *brk = __heap_start; + +int _brk(void *addr) +{ + brk = addr; + return 0; +} + +void *_sbrk(ptrdiff_t incr) +{ + char *old_brk = brk; + volatile uint32_t sp; + + char *new_brk = brk += incr; + __asm__ volatile("mv %0, x2" : "=r"(sp) : : ); + + if (new_brk < (char *) sp && new_brk < __heap_end) + { + brk = new_brk; + + return old_brk; + } + else + { + errno = ENOMEM; + return (void *) -1; + } +} + +void handle_syscall (long a0, + long a1, + long a2, + long a3, + __attribute__((unused)) long a4, + __attribute__((unused)) long a5, + __attribute__((unused)) long a6, + long a7) { + #ifdef __riscv_32e + register 
long syscall_id asm("t0"); + #else + long syscall_id = a7; + #endif + + switch (syscall_id) { + case SYS_exit: + _exit (a0); + break; + case SYS_read: + _read (a0, (void *) a1, a2); + break; + case SYS_write: + _write (a0, (const void *) a1, a2); + break; + case SYS_getpid: + _getpid (); + break; + case SYS_kill: + _kill (a0, a1); + break; + case SYS_open: + _open ((const char *) a0, a1, a2); + break; + case SYS_openat: + _openat (a0, (const char *) a1, a2, a3); + break; + case SYS_close: + _close (a0); + break; + case SYS_lseek: + _lseek (a0, a1, a2); + break; + case SYS_brk: + _brk ((void *) a0); + break; + case SYS_link: + _link ((const char *) a0, (const char *) a1); + break; + case SYS_unlink: + _unlink ((const char *) a0); + break; + case SYS_chdir: + _chdir ((const char *) a0); + break; + case SYS_getcwd: + _getcwd ((char *) a0, a1); + break; + case SYS_stat: + _stat ((const char *) a0, (struct stat *) a1); + break; + case SYS_fstat: + _fstat (a0, (struct stat *) a1); + break; + case SYS_lstat: + _lstat ((const char *) a0, (struct stat *) a1); + break; + case SYS_fstatat: + _fstatat (a0, (const char *) a1, (struct stat *) a2, a3); + break; + case SYS_access: + _access ((const char *) a0, a1); + break; + case SYS_faccessat: + _faccessat (a0, (const char *) a1, a2, a3); + break; + case SYS_gettimeofday: + _gettimeofday ((struct timeval *) a0, (void *) a1); + break; + case SYS_times: + _times ((struct tms *) a0); + break; + default: + unimplemented_syscall (); + break; + } +} diff --git a/verif/tests/custom/debug_test/bsp/vectors.S b/verif/tests/custom/debug_test/bsp/vectors.S new file mode 100644 index 0000000000..1f97e896de --- /dev/null +++ b/verif/tests/custom/debug_test/bsp/vectors.S @@ -0,0 +1,55 @@ +/* +* Copyright 2019 ETH Zürich and University of Bologna +* +* Licensed under the Apache License, Version 2.0 (the "License"); +* you may not use this file except in compliance with the License. 
+* You may obtain a copy of the License at +* +* http://www.apache.org/licenses/LICENSE-2.0 +* +* Unless required by applicable law or agreed to in writing, software +* distributed under the License is distributed on an "AS IS" BASIS, +* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +* See the License for the specific language governing permissions and +* limitations under the License. +*/ + +.section .vectors, "ax" +.option push +.option norvc +.global vector_table + +vector_table: + j u_sw_irq_handler + j __no_irq_handler + j __no_irq_handler + j m_software_irq_handler + j __no_irq_handler + j __no_irq_handler + j __no_irq_handler + j m_timer_irq_handler + j __no_irq_handler + j __no_irq_handler + j __no_irq_handler + j m_external_irq_handler + j __no_irq_handler + j __no_irq_handler + j __no_irq_handler + j __no_irq_handler + j m_fast0_irq_handler + j m_fast1_irq_handler + j m_fast2_irq_handler + j m_fast3_irq_handler + j m_fast4_irq_handler + j m_fast5_irq_handler + j m_fast6_irq_handler + j m_fast7_irq_handler + j m_fast8_irq_handler + j m_fast9_irq_handler + j m_fast10_irq_handler + j m_fast11_irq_handler + j m_fast12_irq_handler + j m_fast13_irq_handler + j m_fast14_irq_handler + j m_fast15_irq_handler +.option pop diff --git a/verif/tests/custom/debug_test/debug_test.c b/verif/tests/custom/debug_test/debug_test.c new file mode 100644 index 0000000000..5111d735c4 --- /dev/null +++ b/verif/tests/custom/debug_test/debug_test.c @@ -0,0 +1,553 @@ +/* +** +** Copyright 2020 OpenHW Group +** +** Licensed under the Solderpad Hardware Licence, Version 2.0 (the "License"); +** you may not use this file except in compliance with the License. 
+** You may obtain a copy of the License at +** +** https://solderpad.org/licenses/ +** +** Unless required by applicable law or agreed to in writing, software +** distributed under the License is distributed on an "AS IS" BASIS, +** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +** See the License for the specific language governing permissions and +** limitations under the License. +** +******************************************************************************* +** Basic debugger test. Needs more work and bugs fixed +** It will launch a debug request and have debugger code execute (debugger.S) +******************************************************************************* +*/ + +#include +#include +#include +#include "corev_uvmt.h" + +volatile int glb_hart_status = 0; // Written by main code only, read by debug code +volatile int glb_debug_status = 0; // Written by debug code only, read by main code +volatile int glb_ebreak_status = 0; // Written by ebreak code only, read by main code +volatile int glb_illegal_insn_status = 0; // Written by illegal instruction code only, read by main code +volatile int glb_debug_exception_status = 0; // Written by debug code during exception only +volatile int glb_exception_ebreak_status = 0; // Written by main code, read by exception handler + +volatile int glb_previous_dpc = 0; // holds last dpc, used for checking correctness of stepping +volatile int glb_step_info = 0; // info to dbg code about actions to take on stepping +volatile int glb_step_count = 0; // Written by debug code for each time single step is entered +// Expectation flags. 
Raise an error if handler or routine is enterred when not expected, +volatile int glb_expect_illegal_insn = 0; +volatile int glb_expect_ebreak_handler = 0; +volatile int glb_expect_debug_entry = 0; +volatile int glb_expect_debug_exception = 0; +volatile int glb_expect_irq_entry = 0; +volatile int glb_irq_timeout = 0; +// Counter values +// Checked at start and end of debug code +// Only lower 32 bits checked, as simulation cannot overflow on 32 bits +volatile int glb_mcycle_start = 0; +volatile int glb_mcycle_end = 0; +volatile int glb_minstret_start = 0; +volatile int glb_minstret_end = 0; +#define TEST_FAILED *(volatile int *)CV_VP_STATUS_FLAGS_BASE = 1 +#define TEST_PASSED *(volatile int *)CV_VP_STATUS_FLAGS_BASE = 123456789 + +extern int __stack_start; +typedef union { + struct { + unsigned int start_delay : 15; // 14: 0 + unsigned int rand_start_delay : 1; // 15 + unsigned int pulse_width : 13; // 28:16 + unsigned int rand_pulse_width : 1; // 29 + unsigned int pulse_mode : 1; // 30 0 = level, 1 = pulse + unsigned int value : 1; // 31 + } fields; + unsigned int bits; +} debug_req_control_t; + +#define DEBUG_REQ_CONTROL_REG *((volatile uint32_t *) (CV_VP_DEBUG_CONTROL_BASE)) +#define TIMER_REG_ADDR ((volatile uint32_t *) (CV_VP_INTR_TIMER_BASE+0)) +#define TIMER_VAL_ADDR ((volatile uint32_t *) (CV_VP_INTR_TIMER_BASE+4)) + +typedef union { + struct { + unsigned int uie : 1; // 0 // Implemented if USER mode enabled + unsigned int sie : 1; // 1 + unsigned int wpri : 1; // 2 + unsigned int mie : 1; // 3 // Implemented + unsigned int upie : 1; // 4 // Implemented if USER mode enabled + unsigned int spie : 1; // 5 + unsigned int wpri0 : 1; // 6 + unsigned int mpie : 1; // 7 // Implemented + unsigned int spp : 1; // 8 + unsigned int wpri1 : 2; // 10: 9 + unsigned int mpp : 2; // 12:11 // Implemented + unsigned int fs : 2; // 14:13 + unsigned int xs : 2; // 16:15 + unsigned int mprv : 1; // 17 + unsigned int sum : 1; // 18 + unsigned int mxr : 1; // 19 + unsigned int 
tvm : 1; // 20 + unsigned int tw : 1; // 21 + unsigned int tsr : 1; // 22 + unsigned int wpri3 : 8; // 30:23 + unsigned int sd : 1; // 31 + } fields; + unsigned int bits; +} mstatus_t; + +extern void _single_step(int d); +// Tag is simply to help debug and determine where the failure came from +void check_debug_status(int tag, int value) +{ + if(glb_debug_status != value){ + printf("ERROR: check_debug_status(%d, %d): Tag=%d status=%d, exp=%d \n\n", + tag, value, tag, glb_debug_status, value); + TEST_FAILED; + } +} +void check_debug_exception_status(int tag, int value) +{ + if(glb_debug_exception_status != value){ + printf("ERROR: check_debug_exception_status(%d, %d): Tag=%d status=%d, exp=%d \n\n", + tag, value, tag, glb_debug_exception_status, value); + TEST_FAILED; + } +} +void check_hart_status(int tag, int value) +{ + if(glb_hart_status != value){ + printf("ERROR: check_hart_status(%d, %d): Tag=%d status=%d, exp=%d \n\n", + tag, value, tag, glb_hart_status, value); + TEST_FAILED; + } +} +void check_ebreak_status(int tag, int value) +{ + if(glb_ebreak_status != value){ + printf("ERROR: check_ebreak_status(%d, %d): Tag=%d status=%d, exp=%d \n\n", + tag, value, tag, glb_ebreak_status, value); + TEST_FAILED; + } +} +void check_illegal_insn_status(int tag, int value) +{ + if(glb_illegal_insn_status != value){ + printf("ERROR: check_illegal_insn_status(%d, %d): Tag=%d status=%d, exp=%d \n\n", + tag, value, tag, glb_illegal_insn_status, value); + TEST_FAILED; + } +} +void delay(int count) { + for (volatile int d = 0; d < count; d++); +} + +void mstatus_mie_enable() { + int mie_bit = 0x1 << 3; + asm volatile("csrrs x0, mstatus, %0" : : "r" (mie_bit)); +} + +void mstatus_mie_disable() { + int mie_bit = 0x1 << 3; + asm volatile("csrrc x0, mstatus, %0" : : "r" (mie_bit)); +} + +void mie_enable_all() { + uint32_t mie_mask = (uint32_t) -1; + asm volatile("csrrs x0, mie, %0" : : "r" (mie_mask)); +} + +void mie_disable_all() { + uint32_t mie_mask = (uint32_t) -1; + asm 
volatile("csrrc x0, mie, %0" : : "r" (mie_mask)); +} + +void mie_enable(uint32_t irq) { + // Enable the interrupt irq in MIE + uint32_t mie_bit = 0x1 << irq; + asm volatile("csrrs x0, mie, %0" : : "r" (mie_bit)); +} + +void mie_disable(uint32_t irq) { + // Disable the interrupt irq in MIE + uint32_t mie_bit = 0x1 << irq; + asm volatile("csrrc x0, mie, %0" : : "r" (mie_bit)); +} + +void mm_ram_assert_irq(uint32_t mask, uint32_t cycle_delay) { + *TIMER_REG_ADDR = mask; + *TIMER_VAL_ADDR = 1 + cycle_delay; +} + +void counters_enable() { + // Enable counters mcycle (bit0) and minstret (bit2) + uint32_t mask = 1<<2 | 1<<0; + asm volatile("csrrc x0, 0x320, %0" : : "r" (mask)); +} +#define MACHINE 3 +int main(int argc, char *argv[]) +{ + unsigned int temp,temp1,temp2; + debug_req_control_t debug_req_control; + mstatus_t mstatus, mstatus_cmp; + counters_enable(); + printf("\nBasic test checking debug functionality.\n"); + + printf("------------------------\n"); + printf(" Test1: check initialization values\n"); + //check_debug_status(0,0); + + temp1 = 0xFFFFFFFF; + /* get relevant CSRs and compare init values*/ + __asm__ volatile("csrr %0, mstatus" : "=r"(temp1)); + __asm__ volatile("csrw mstatus, %0 " : "=r"(temp1)); + __asm__ volatile("csrr %0, mstatus" : "=r"(mstatus.bits)); + __asm__ volatile("csrr %0, mie" : "=r"(temp)); + __asm__ volatile("csrw mie, %0 " : "=r"(temp1)); + __asm__ volatile("csrr %0, mie" : "=r"(temp2)); + printf("\tmstats_rval = 0x%0x 0x%0x 0x%0x 0x%0x\n",temp2,mstatus.bits,temp,temp1); + + check_debug_status(0,0); + printf("------------------------\n"); + printf(" Test2.1: check access to Debug and Trigger registers\n"); + // debug specifcation 13.2: 4.8 Core Debug Registers are not accessable unless in debug mode + + // ---------------------- + // Check Debug Write Access + temp = 0xFFFFFFFF; + temp1 = glb_illegal_insn_status+1; + glb_expect_illegal_insn = 1; + __asm__ volatile("csrw dcsr, %0" : "=r"(temp)); // Debug DCSR + 
check_illegal_insn_status(11,temp1++); + glb_expect_illegal_insn = 1; + __asm__ volatile("csrw dpc, %0" : "=r"(temp)); // Debug DPC + check_illegal_insn_status(12,temp1++); + glb_expect_illegal_insn = 1; + __asm__ volatile("csrw dscratch, %0" : "=r"(temp)); // Debug DSCRATCH0 + check_illegal_insn_status(13,temp1++); + glb_expect_illegal_insn = 1; + __asm__ volatile("csrw 0x7b3, %0" : "=r"(temp)); // Debug DSCRATCH1 + check_illegal_insn_status(14,temp1++); + + // Check Read Access + temp1 = glb_illegal_insn_status+1; + // Allow illegal instruction handler to run + glb_expect_illegal_insn = 1; + __asm__ volatile("csrr %0, dcsr" : "=r"(temp)); // Debug DCSR + check_illegal_insn_status(1,temp1++); + glb_expect_illegal_insn = 1; + __asm__ volatile("csrr %0, dpc" : "=r"(temp)); // Debug DPC + check_illegal_insn_status(2,temp1++); + glb_expect_illegal_insn = 1; + __asm__ volatile("csrr %0, dscratch": "=r"(temp)); // Debug DSCRATCH0 + check_illegal_insn_status(3,temp1++); + glb_expect_illegal_insn = 1; + __asm__ volatile("csrr %0, 0x7b3" : "=r"(temp)); // Debug DSCRATCH1 + check_illegal_insn_status(4,temp1++); + + printf("------------------------\n"); + printf(" Test2.2: check access to Trigger registers\n"); + + // NOTE: As of July 2024, CVA6 does not implement the trigger module. + // TODO: Consider adding trigger module support in future revisions. + + // Writes are ignored + temp = 0xFFFFFFFF; //TODO:MT should these be writes? 
+ __asm__ volatile("csrw 0x7a0, %0" : "=r"(temp)); // Trigger TSELECT + __asm__ volatile("csrw 0x7a1, %0" : "=r"(temp)); // Trigger TDATA1 + __asm__ volatile("csrw 0x7a2, %0" : "=r"(temp)); // Trigger TDATA2 + __asm__ volatile("csrw 0x7a3, %0" : "=r"(temp)); // Trigger TDATA3 + __asm__ volatile("csrw 0x7a4, %0" : "=r"(temp)); // Trigger TINFO + + // Read default value + __asm__ volatile("csrr %0, 0x7a0" : "=r"(temp)); // Trigger TSELECT + // CVA6 if(temp != 0x0){printf("ERROR: TSELET Read\n");TEST_FAILED;} + + __asm__ volatile("csrr %0, 0x7a1" : "=r"(temp)); // Trigger TDATA1 + // CVA6 // 31:28 type = 2 + // CVA6 // 27 dmode = 1 + // CVA6 // 15:12 action = 1 + // CVA6 // 6 m(achine) = 1 + // CVA6 if(temp != (2<<28 | 1<<27 | 1<<12 | 1<<6)){printf("ERROR: TDATA1 Read\n");TEST_FAILED;} + + __asm__ volatile("csrr %0, 0x7a2" : "=r"(temp)); // Trigger TDATA2 + // CVA6 if(temp != 0x0){printf("ERROR: TDATA2 Read\n");TEST_FAILED;} + + __asm__ volatile("csrr %0, 0x7a3" : "=r"(temp)); // Trigger TDATA3 + // CVA6 if(temp != 0x0){printf("ERROR: TDATA3 Read\n");TEST_FAILED;} + + __asm__ volatile("csrr %0, 0x7a4" : "=r"(temp)); // Trigger TINFO + // CVA6 // tmatch = 1<<2 + // CVA6 if(temp != 1<<2){printf("ERROR: TINFO Read %d \n",temp);TEST_FAILED;} + + + // Do not expect or allow any more illegal instructions + + + mstatus_cmp = (mstatus_t) { + .fields.mpp = MACHINE // + }; + if(mstatus_cmp.bits != mstatus.bits) {printf("ERROR: init mstatus mismatch exp=%x val=%x\n", + mstatus_cmp.bits, mstatus.bits); TEST_FAILED;} + //TODO:MT are these switched up? 
+ printf("------------------------\n"); + printf(" Test3.1: check hart ebreak executes ebreak handler but does not execute debugger code\n"); + glb_expect_ebreak_handler = 1; + asm volatile("c.ebreak"); + check_ebreak_status(32,1); + + printf("------------------------\n"); + printf(" Test3.2: check hart c.ebreak executes ebreak handler but does not execute debugger code\n"); + glb_expect_ebreak_handler = 1; + asm volatile(".4byte 0x00100073"); + check_ebreak_status(32,2); + + printf("------------------------\n"); + printf(" Test4: request hardware debugger\n"); + + debug_req_control = (debug_req_control_t) { + .fields.value = 1, + .fields.pulse_mode = 1, //PULSE Mode + .fields.rand_pulse_width = 0, + .fields.pulse_width = 5,// TODO:MT determine pulse width with non-sticky debug_req + .fields.rand_start_delay = 0, + .fields.start_delay = 200 + }; + glb_expect_debug_entry = 1; + DEBUG_REQ_CONTROL_REG = debug_req_control.bits; + + glb_hart_status = 4; // Basic test + while(glb_debug_status != glb_hart_status){ + printf("Wait for Debugger\n"); + } + check_debug_status(41,glb_hart_status); + + printf("------------------------\n"); + printf(" Test5: have debugger execute ebreak 3 more times\n"); + + glb_hart_status = 5; + glb_expect_debug_entry = 1; + DEBUG_REQ_CONTROL_REG = debug_req_control.bits; + while(glb_debug_status != (5+3)){ + printf("Wait for Debugger\n"); + } + check_debug_status(51,(5+3)); + + printf("------------------------\n"); + printf(" Test6: Test CSR access and default values in debug mode\n"); + glb_hart_status = 6; + glb_expect_debug_entry = 1; + DEBUG_REQ_CONTROL_REG = debug_req_control.bits; + while(glb_debug_status != glb_hart_status){ + printf("Wait for Debugger\n"); + } + check_debug_status(61,glb_hart_status); + + + printf("------------------------\n"); + printf(" Test10: check hart ebreak executes debugger code\n"); + glb_hart_status = 10; + glb_expect_debug_entry = 1; + asm volatile(".4byte 0x00100073"); + 
check_debug_status(33,glb_hart_status); + + printf("------------------------\n"); + printf(" Test11: check illegal csr exception during debug launches debugger exception and no csr modified\n"); + glb_hart_status = 11; + glb_expect_debug_entry = 1; + glb_expect_debug_exception = 1; + DEBUG_REQ_CONTROL_REG = debug_req_control.bits; + while(glb_debug_status != glb_hart_status){ + printf("Wait for Debugger\n"); + } + check_debug_status(111,glb_hart_status); + check_debug_exception_status(111,glb_hart_status); + //FIXME TBD BUG : need to update test to check actual csrs not modified. + + printf("------------------------\n"); + printf(" Test12: check ecall exception during debug launches debugger exception and no csr modified\n"); + glb_hart_status = 12; + glb_expect_debug_entry = 1; + glb_expect_debug_exception = 1; + DEBUG_REQ_CONTROL_REG = debug_req_control.bits; + while(glb_debug_status != glb_hart_status){ + printf("Wait for Debugger\n"); + } + check_debug_status(112,glb_hart_status); + check_debug_exception_status(112,glb_hart_status); + //FIXME TBD BUG : need to update test to check actual csrs not modified. + + printf("------------------------\n"); + printf(" Test13: check mret during debug launches debugger exception and no csr modified\n"); + glb_hart_status = 13; + glb_expect_debug_entry = 1; + glb_expect_debug_exception = 1; + DEBUG_REQ_CONTROL_REG = debug_req_control.bits; + while(glb_debug_status != glb_hart_status){ + printf("Wait for Debugger\n"); + } + check_debug_status(113,glb_hart_status); + check_debug_exception_status(113,glb_hart_status); + + printf("------------------------\n"); + printf(" Test14: Check exception ebreak enters debug mode\n"); + glb_hart_status = 14; + glb_expect_illegal_insn = 1; + glb_exception_ebreak_status = 1; + glb_expect_debug_entry = 1; + + // DCSR read will cause illegal instruction. 
+ // Exception routine reads glb_exception_ebreak_status=1 and executes c.ebreak + __asm__ volatile("csrr %0, dcsr" : "=r"(temp)); // Debug DCSR + + while(glb_debug_status != glb_hart_status){ + printf("Wait for Debugger\n"); + } + + check_illegal_insn_status(114,temp1++); + check_debug_status(114, glb_hart_status); + printf("----------------------\n"); + printf("Test 16: dret in m-mode causes exception\n"); + + glb_expect_illegal_insn = 1; + __asm__ volatile("dret"); + check_illegal_insn_status(16, temp1++); + + printf("------------------------\n"); + printf("Test 17: WFI before debug_req_i and WFI in debug mode\n"); + printf("If test hangs, WFI is NOT converted to NOP\n"); + + glb_expect_debug_entry = 1; + glb_hart_status = 17; + // start_delay is set to 200, should get the wfi executing before dbg request is asserted + DEBUG_REQ_CONTROL_REG = debug_req_control.bits; + + // Execute WFI, when debug is asserted, it will act as NOP and enter debug mode + // If not, test will hang + __asm__ volatile("wfi"); + check_debug_status(117, glb_hart_status); + + printf("----------------------\n"); + printf("Checking interrupt, as this is needed by later tests\n"); + + // Assert and check irq, as this is needed by some tests. 
+ mstatus_mie_enable(); + mie_enable(30); + glb_expect_irq_entry = 1; + mm_ram_assert_irq(0x40000000, 1); + while(glb_expect_irq_entry == 1); + mm_ram_assert_irq(0,0); + printf("Irq check done\n"); + + // Check that stoupcount bit (10) in dcsr has no affect + printf("-------------------------\n"); + printf("Test 21: Setting stopcount bit=1\n"); + glb_expect_debug_entry = 1; + glb_hart_status = 21; + + DEBUG_REQ_CONTROL_REG = debug_req_control.bits; + while(glb_debug_status != glb_hart_status){ + printf("Wait for Debugger\n"); + } + check_debug_status(121, glb_hart_status); + + + printf("------------------------\n"); + printf("Test 18: Single stepping\n"); + glb_hart_status = 18; + + // Run single step code (in single_step.S) + _single_step(0); + + // Single step code should generate 2 illegal insn + temp1++; + check_illegal_insn_status(118, temp1++); + check_debug_status(118, glb_hart_status); + + printf("Stepped %d times\n", glb_step_count); + + + printf("------------------------\n"); + printf("Test 19: irq in debug\n"); + glb_hart_status = 19; + glb_expect_debug_entry = 1; + + // Does not expect irq to be taken while in debug mode + // but it will be taken when we exit from debug. 
+ // Timeout added in debug code to check for taken irq or not + glb_expect_irq_entry = 1; + DEBUG_REQ_CONTROL_REG=debug_req_control.bits; + + while(glb_debug_status != glb_hart_status){ + printf("Wait for Debugger\n"); + } + + check_debug_status(119, glb_hart_status); + if(glb_irq_timeout != 0) { + printf("glb_irq_timeout != 0, interrupt taken in debug.\n"); + TEST_FAILED; + } + + // Test debug req vs irq timing + printf("-----------------------\n"); + printf("Test 20: Asserting debug_req and irq at the same cycle\n"); + glb_expect_debug_entry = 1; + glb_expect_irq_entry = 1; + glb_hart_status = 20; + DEBUG_REQ_CONTROL_REG = debug_req_control.bits; + // 170 halts on first instuction in interrupt handler + // 175 gives same timing for interrupt and debug_req_i + mm_ram_assert_irq(0x40000000, 175+20); + + while(glb_debug_status != glb_hart_status){ + printf("Wait for Debugger\n"); + } + check_debug_status(120, glb_hart_status); + + + // Execute fence instruction in debug + printf("-----------------------------\n"); + printf("Test 22: Execute fence in debug mode\n"); + glb_expect_debug_entry = 1; + glb_hart_status = 22; + DEBUG_REQ_CONTROL_REG = debug_req_control.bits; + + while(glb_debug_status != glb_hart_status) { + printf("Wait for debugger\n"); + } + + check_debug_status(121, glb_hart_status); + + printf("------------------------\n"); + printf("Test 23: trigger match in debug mode with match disabled\n"); + glb_hart_status = 23; + glb_expect_debug_entry = 1; + + // Request debug + DEBUG_REQ_CONTROL_REG = debug_req_control.bits; + + while(glb_debug_status != glb_hart_status){ + printf("Wait for Debugger\n"); + } + + check_debug_status(123, glb_hart_status); + printf("------------------------\n"); + printf("Test 24: trigger register access in D-mode\n"); + glb_hart_status = 24; + glb_expect_debug_entry = 1; + + // Request debug + DEBUG_REQ_CONTROL_REG = debug_req_control.bits; + + while(glb_debug_status != glb_hart_status){ + printf("Wait for Debugger\n"); + } + + 
check_debug_status(124, glb_hart_status); + + //-------------------------------- + //return EXIT_FAILURE; + printf("------------------------\n"); + printf("Finished \n"); + return EXIT_SUCCESS; +} diff --git a/verif/tests/custom/debug_test/debugger.S b/verif/tests/custom/debug_test/debugger.S new file mode 100644 index 0000000000..d7ab3f045d --- /dev/null +++ b/verif/tests/custom/debug_test/debugger.S @@ -0,0 +1,642 @@ + +/* +** +** Copyright 2020 OpenHW Group +** +** Licensed under the Solderpad Hardware Licence, Version 2.0 (the "License"); +** you may not use this file except in compliance with the License. +** You may obtain a copy of the License at +** +** https://solderpad.org/licenses/ +** +** Unless required by applicable law or agreed to in writing, software +** distributed under the License is distributed on an "AS IS" BASIS, +** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +** See the License for the specific language governing permissions and +** limitations under the License. 
+** +******************************************************************************* +** Debugger code +******************************************************************************* +*/ + +#include "corev_uvmt.h" + +.section .debugger, "ax" +.global _debugger_start +.global glb_debug_status +.global glb_hart_status +.global glb_expect_debug_entry +.global glb_step_info +.global glb_previous_dpc +.global glb_step_count +.global glb_irq_timeout +.global glb_mcycle_start +.global glb_mcycle_end +.global glb_minstret_start +.global glb_minstret_end +.global _step_trig_point +.global __debugger_stack_start +.global _debugger_fail +.global _debugger_end +.set timer_reg_addr, CV_VP_INTR_TIMER_BASE+0 +.set timer_val_addr, CV_VP_INTR_TIMER_BASE+4 +.set test_ret_val, CV_VP_STATUS_FLAGS_BASE +.set test_fail, 0x1 + +_debugger_start: + // Debugger Stack + csrw dscratch, a0 // dscratch0 + la a0, __debugger_stack_start + //sw t0, 0(a0) + csrw 0x7b3, t0 // dscratch1 + sw t1, 4(a0) + sw t2, 8(a0) + sw a1, 12(a0) + sw a2, 16(a0) + // Check if expecting debug entry + la a1, glb_expect_debug_entry + lw t1, 0(a1) + beq x0,t1,_debugger_fail + + // Read lower 32 bits of mcycle and minstret + // and store in globals for check at exit + csrr t1, mcycle + csrr t2, minstret + la a1, glb_mcycle_start + sw t1, 0(a1) + la a1, glb_minstret_start + sw t2, 0(a1) + + // Determine Test to execute in debugger code based on glb_hart_status + la a2, glb_hart_status + lw t2, 0(a2) + + // ebreak test will loop in debugger code over several iterations + // and will increment the global status each time + li t0,5 + beq t2,t0,_debugger_ebreak // Test 5 + + // For all other tests, + // Set debug status = hart status + la a1, glb_debug_status + sw t2, 0(a1) + + li t0, 4 + beq t2,t0,_debugger_simple // Test 4 + + li t0,6 + beq t2,t0,_debugger_csr // Test 6 + + li t0,10 + beq t2,t0,_debugger_ebreak_entry // Test 10 + + li t0,11 + beq t2,t0,_debugger_csr_exception // Test 11 + + li t0,12 + beq 
t2,t0,_debugger_ecall_exception // Test 12 + + li t0,13 + beq t2,t0,_debugger_mret_call // Test 13 + + li t0,14 + beq t2,t0, _debugger_ebreak_entry // Test 14 + + + li t0,17 + beq t2,t0, _debugger_wfi_test // Test 17 + + li t0,18 + beq t2, t0, _debugger_single_step + + li t0, 19 + beq t2, t0, _debugger_irq + + li t0, 20 + beq t2, t0, _debugger_req_and_irq + + li t0, 21 + beq t2, t0, _debugger_stopcount + + li t0, 22 + beq t2, t0, _debugger_fence + + li t0, 23 + beq t2, t0, _debugger_trigger_disabled_in_debug + + li t0, 24 + beq t2, t0, _debugger_trigger_regs_access + +_debugger_trigger_regs_access: + # R/W trigger regs otherwise not accessed + # to close coverage holes + li t0, 0xff + csrw 0x7a4, t0 # tinfo + csrr t0, 0x7a4 + li t1, 4 + bne t0, t1, _debugger_fail + + li t0, 0xff + csrw 0x7a3, t0 # tdata3 + csrr t0, 0x7a3 + bne t0, x0, _debugger_fail + + li t0, 0xff + csrw 0x7a0, t0 # tsel + csrr t0, 0x7a0 + bne t0, x0, _debugger_fail + + j _debugger_end + + +_debugger_fence: + fence + nop + nop + fence.i + nop + nop + j _debugger_end + +_debugger_req_and_irq: + // Debug was requested at the same cycle as irq + // Check dpc to see that pc is not at irq handler + // IRQ used was 30, so addr would be 30*4=120, 0x78 + csrr t0, dpc + li t1, 0x78 + beq t0, t1, _debugger_fail + j _debugger_end + +_debugger_stopcount: + li t0, 1<<10 + csrrs x0, dcsr, t0 + j _debugger_end +_debugger_irq: + // Assert irq + li a1, timer_reg_addr + li t0, 0x40000000 + sw t0, 0(a1) + li a1, timer_val_addr + li t0, 2 + sw t0, 0(a1) + + li t1, 1000 +// Wait for 1000 cycles, then timeout +_irq_wait_loop: + la a1, glb_expect_irq_entry + lw t0, 0(a1); + beq t1, x0, _irq_loop_end + addi t1, t1, -1 + bne t0, x0, _irq_wait_loop +_irq_loop_end: + la a1, glb_irq_timeout + sw t1, 0(a1) + j _debugger_end + +_debugger_single_step: + // Copy step_info + la a1, glb_step_info + lw t0, 0(a1) + + // Check action to take + li t1, 0 + beq t0, t1, _debugger_single_step_basic + li t1, 1 + beq t0, t1, 
_debugger_single_step_enable + li t1, 2 + beq t0, t1, _debugger_single_step_disable + li t1, 3 + //beq t0, t1, _debugger_single_step_illegal_insn + beq t0, t1, _debugger_single_step_basic + li t1, 4 + beq t0, t1, _debugger_single_step_trig_setup + li t1, 5 + beq t0, t1, _debugger_single_step_stepie_enable + li t1, 6 + beq t0, t1, _debugger_single_step_stepie_disable + li t1, 7 + beq t0, t1, _debugger_single_step_cebreak + li t1, 8 + beq t0, t1, _debugger_single_step_ebreak + li t1, 9 + beq t0, t1, _debugger_single_step_ebreak_exception + li t1, 10 + beq t0, t1, _debugger_single_step_cebreak_exception + j _debugger_fail + +_debugger_single_step_stepie_disable: + // disable stepie: write dcsr with bit 11 cleared, ebreakm (bit 15) and step (bit 2) kept set + li t0, 4<<28 | 1<<15 | 1<<2 | 0<<11 + csrw dcsr, t0 + + j _debugger_single_step_end +_debugger_single_step_stepie_enable: + // enable stepie: write dcsr with bit 11 set, ebreakm (bit 15) and step (bit 2) kept set + li t0, 4<<28 | 1<<15 | 1<<2 | 1<<11 + csrw dcsr, t0 + + j _debugger_single_step_end +_debugger_single_step_enable: + // Check dcsr: expect ebreakm (bit 15) set and cause field (8:6) == 1 + csrr t0, dcsr + li t1, 4<<28 | 1<<15 | 1<<6 | 3<<0 + bne t0, t1, _debugger_fail + + // Enable step bit in dcsr + li t0, 4<<28 | 1<<15 | 1<<2 + csrw dcsr, t0 + + // c.ebreak (2 bytes) was used to enter the single step test, so advance dpc by 2 + csrr t0, dpc + addi t0, t0, 2 + csrw dpc, t0 + j _debugger_single_step_end + +_debugger_single_step_disable: + // Turn off single stepping (only ebreakm, bit 15, is left set) + li t0, 1<<15 + csrw dcsr, t0 + // Clear glb_expect_debug_entry + // as this will not be done in + // _debugger_end for single step + la a1, glb_expect_debug_entry + sw x0, 0(a1) + j _debugger_end + +_debugger_single_step_trig_setup: + // Set trigger to match on _step_trig_point + la t0, _step_trig_point + csrw tdata2,t0 + li t1, 1<<2 + csrw tdata1,t1 + li t1, 2<<28 | 1<<27 | 1<<12 | 1<<6 | 1 <<2 + csrr t2,tdata1 + bne t1,t2,_debugger_fail + j _debugger_single_step_basic + +_debugger_single_step_end: + // Store dpc to variable for checking in next step + la a1, glb_previous_dpc + csrr t0, dpc + sw t0, 0(a1) + + // Increase step count + la a1, glb_step_count + lw t0, 0(a1) + 
addi t0, t0, 1 + sw t0, 0(a1) + + // Clear step info if not 3 (exception) or 5 (irq while stepping) + // 6, 7 or 8, 9, 10 + // In exception test we expect jumps + // to mtvec and other places, so keep + // step info to waive dpc checks + la a1, glb_step_info + lw t0, 0(a1) + li t1, 3 + beq t0, t1, _debugger_end + li t1, 5 + beq t0, t1, _debugger_end + li t1, 6 + beq t0, t1, _debugger_end + li t1, 7 + beq t0, t1, _debugger_end + li t1, 8 + beq t0, t1, _debugger_end + li t1, 9 + beq t0, t1, _debugger_end + li t1, 10 + beq t0, t1, _debugger_end + li t1, 0 + sw t1, 0(a1) + + // return to m-mode + j _debugger_end + +_debugger_single_step_illegal_insn: + // Check dcsr + // ebreakm step stepen + li t1, 4<<28 | 1<<15 | 4<<6 | 1<<2 | 3<<0 + csrr t2, dcsr + bne t1, t2, _debugger_fail + // read dpc and mtvec + //csrr t0, dpc + //csrr t1, mtvec + //andi t1, t1, 0xffffff00 + //bne t0, t1, _debugger_fail + j _debugger_single_step_end + +_debugger_step_trig_entry: + // Advance dpc to skip first match instruction + csrr t0, dpc + la a1, _step_trig_exit + csrw dpc, a1 + j _debugger_single_step_end + +_debugger_single_step_cebreak: + # If cause == 1, we need to advance dpc by 2 + li t1, 4<<28 | 1<<15 | 1<<6 | 1<<2 | 3<<0 + csrr t2, dcsr + beq t1, t2, _inc_dpc_cebreak + + j _debugger_single_step_end +_inc_dpc_cebreak: + csrr t1, dpc + addi t1, t1, 2 + csrw dpc, t1 + j _debugger_single_step_end + +_debugger_single_step_ebreak: + # If cause == 1, we need to advance dpc by 4 + li t1, 4<<28 | 1<<15 | 1<<6 | 1<<2 | 3<<0 + csrr t2, dcsr + beq t1, t2, _inc_dpc_ebreak + + j _debugger_single_step_end +_inc_dpc_ebreak: + csrr t1, dpc + addi t1, t1, 4 + csrw dpc, t1 + + # Turn off dcsr.ebreakm for next two tests + li t1, 4<<28 | 0<<15 | 1<<6 | 1<<2 | 3<<0 + csrw dcsr, t1 + j _debugger_single_step_end + +_debugger_single_step_ebreak_exception: + j _debugger_single_step_end + +_debugger_single_step_cebreak_exception: + # depc != 0 => we have passed the first + # instruction of the handler, and we 
can + # set dcsr.ebreakm again + csrr t0, dpc + bne t0, x0, _end + + li t1, 4<<28 | 1<<15 | 1<<6 | 1<<2 | 3<<0 + csrw dcsr, t1 + +_end: + j _debugger_single_step_end + +_debugger_single_step_basic: + // Check dcsr, jump to match-in-step if flagged in dcsr + li t1, 4<<28 | 1<<15 | 2<<6 | 1<<2 | 3<<0 + csrr t2, dcsr + beq t1, t2, _debugger_step_trig_entry + + + // Ensure tval (0x343) always == 0 + csrr t1, 0x343 + bne x0, t1, _debugger_fail +// ebreakm step stepen + li t1, 4<<28 | 1<<15 | 4<<6 | 1<<2 | 3<<0 + bne t1, t2, _debugger_fail + // Check that dpc increased by 2 or 4 + csrr t0, dpc + la a1, glb_previous_dpc + lw t1, 0(a1) + sub t0, t0, t1 + li t1, 2 + beq t0, t1, _debugger_single_step_end + li t1, 4 + beq t0, t1, _debugger_single_step_end + + // Waive dpc errors if we expect illegal instruction + la a1, glb_step_info + lw t0, 0(a1) + li t1, 3 +// bne t0, t1, _debugger_fail + j _debugger_single_step_end + +_debugger_csr_exception: + csrr t2,0xea8 // illegal insn + +_debugger_ecall_exception: + ecall // exception + +_debugger_mret_call: + mret // will invoke debugger exception routine + +_debugger_ebreak_entry: + la a1, glb_debug_status + li t1, 4<<28 | 1<<6 | 3<<0 | 1<<15 + csrr t2,dcsr + bne t1,t2,_debugger_fail + csrr a1,dpc + addi a1,a1,4 # uncompressed ebreak used to enter debug here + csrw dpc,a1 + //sw t1, 0(a1) + j _debugger_end + +_debugger_simple: + // Check cause 0x3, debugger + csrr t2,dcsr + li t1, 4<<28 | 3<<6 | 3<<0 + bne t1, t2, _debugger_fail + + //csrr t2,0xea8 // illegal insn + li t1, 1 + //sw t1, 0(a1) + j _debugger_end + +_debugger_csr: + // Check CSR access + // When done, set the ebreakm bit to allow next test to enter debug with ebreak + + // TBD BUG FIXME : make sure appropriate list of CSR (from sspecifications) + //csrr t2,mvendorid + //csrr t2,marchid + //csrr t2,mimpid + csrr t2,mhartid + + // machine trap setup + csrr t2,mstatus + csrr t2,misa + csrr t2,mie + csrr t2,mtvec + //FIXME csrr t2,mtval + + // machine trap handling + csrr 
t2,mscratch + csrr t2,mepc + csrr t2,mcause + csrr t2,mip + + // ----------------------- + // Debug CSRs + + // Expect DCSR + // 31:28 XDEBUGER Version = 4 + // 8:6 Cause = 3 debugger + // 1:0 Privelege = 3 Machine + // TBD FIXME BUG documentation update needed + li t1, 4<<28 | 3<<6 | 3<<0 + csrr t2,dcsr + bne t1,t2,_debugger_fail + csrr t2,dpc + beq x0,t2,_debugger_fail + //Already test this csrr t2,dscratch //dscratch0 + //Already test this csrr t2,0x7b3 //dscratch1 + + // Set ebreakm in dcsr + li t1, 4<<28 | 3<<6 | 3<<0| 1<<15 + csrw dcsr, t1 + + // ---------------------- + // Trigger CSRs + + // Expect TMATCH=TDATA1 + // 31:28 type = 2 + // 27 dmode = 1 + // 15:12 action = 1 + // 6 m(achine) = 1 + li t1, 2<<28 | 1<<27 | 1<<12 | 1<<6 + csrr t2,tdata1 + bne t1,t2,_debugger_fail + csrr t2,tselect + bne x0,t2,_debugger_fail + csrr t2,tdata2 + bne x0,t2,_debugger_fail + csrr t2,tdata3 + bne x0,t2,_debugger_fail + + j _debugger_end + +_debugger_ebreak: + li t0, 4<<28 | 3<<6 | 3<<0 + csrr t1, dcsr + bne t0, t1, _debugger_fail + // Increment glb_debug_status + la a1, glb_debug_status + lw t1, 0(a1) + addi t1,t1,1 + sw t1, 0(a1) + // Repeat executing debug code until debug status = hart_status + 3 + addi t0, t2, 3 + beq t1, t0, _debugger_end + // Execute non-compressed ebreak for iteration 2 + addi t0, t2, 2 + beq t1, t0, _uncompressed_ebreak + // Debugger Un-Stack and call debugger code from start using ebreak + csrr t0, 0x7b3 + lw t1, 4(a0) + lw t2, 8(a0) + lw a1, 12(a0) + lw a2, 16(a0) + csrr a0, dscratch + ebreak +_uncompressed_ebreak: + // Debugger Un-Stack and call debugger code from start using ebreak + csrr t0, 0x7b3 + lw t1, 4(a0) + lw t2, 8(a0) + lw a1, 12(a0) + lw a2, 16(a0) + csrr a0, dscratch + .4byte 0x00100073 # ebreak + +_debugger_trigger_in_debug: + // setup address to trigger on + la a1, _debugger_trig_point + csrw tdata2,a1 + li t1, 1<<2 + csrw tdata1,t1 + li t1, 2<<28 | 1<<27 | 1<<12 | 1<<6 | 1 <<2 + csrr t2,tdata1 + bne t1,t2,_debugger_fail + + // 
Clear glb_expect_debug_entry + // If we trig, we'll reenter debug and + // test will fail due to 0 flag + la a1, glb_expect_debug_entry + li t1, 0 + sw t1, 0(a1) +_debugger_trig_point: + // Should _not_trig here + nop + // Clear trigger + li t1, 0<<2 + csrw tdata1, t1 + j _debugger_end + +_debugger_trigger_disabled_in_debug: + // setup address to trigger on + la a1, _debugger_trig_point_dis + // Set trig enable to 0 + csrw tdata2,a1 + li t1, 0<<2 + csrw tdata1,t1 + li t1, 2<<28 | 1<<27 | 1<<12 | 1<<6 | 0 <<2 + csrr t2,tdata1 + bne t1,t2,_debugger_fail + + // Clear glb_expect_debug_entry + // If we trig, we'll reenter debug and + // test will fail due to 0 flag + la a1, glb_expect_debug_entry + li t1, 0 + sw t1, 0(a1) +_debugger_trig_point_dis: + // Should _not_trig here + nop + // Clear trigger + li t1, 0<<2 + csrw tdata1, t1 + j _debugger_end +_debugger_wfi_test: + la a1, glb_debug_status + csrr t2,dcsr + // ebreakm is set by previous test + li t1, 4<<28 | 3<<6 | 3<<0 | 1<<15 + bne t1, t2, _debugger_fail + + // If the following wfi is not converted + // to a nop, test will hang + wfi + j _debugger_end + +_debugger_end: + // Check counter values. 
+ csrr t1, mcycle + la a1, glb_mcycle_start + lw t2, 0(a1) + sub t1, t1, t2 + beq t1, x0, _debugger_fail + + csrr t1, minstret + la a1, glb_minstret_start + lw t2, 0(a1) + sub t1, t1, t2 + beq t1, x0, _debugger_fail + + // If single stepping, do not clear + la a1, glb_hart_status + lw t0, 0(a1) + li t1, 18 + beq t0, t1, _debugger_end_continue + + // Clear debug entry expectation flag + la a1, glb_expect_debug_entry + sw x0, 0(a1) +_debugger_end_continue: + // Debugger Un-Stack + //lw t0, 0(a0) + la a0, __debugger_stack_start + csrr t0, 0x7b3 + lw t1, 4(a0) + lw t2, 8(a0) + lw a1, 12(a0) + lw a2, 16(a0) + csrr a0, dscratch + dret +_debugger_fail: //Test Failed + li a0, CV_VP_STATUS_FLAGS_BASE + li t0, test_fail + sw t0, 0(a0) + nop + nop + nop + nop + diff --git a/verif/tests/custom/debug_test/debugger_exception.S b/verif/tests/custom/debug_test/debugger_exception.S new file mode 100644 index 0000000000..d3d1e9dd4c --- /dev/null +++ b/verif/tests/custom/debug_test/debugger_exception.S @@ -0,0 +1,77 @@ + +/* +** +** Copyright 2020 OpenHW Group +** +** Licensed under the Solderpad Hardware Licence, Version 2.0 (the "License"); +** you may not use this file except in compliance with the License. +** You may obtain a copy of the License at +** +** https://solderpad.org/licenses/ +** +** Unless required by applicable law or agreed to in writing, software +** distributed under the License is distributed on an "AS IS" BASIS, +** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +** See the License for the specific language governing permissions and +** limitations under the License. 
+** +******************************************************************************* +** Debugger Exception code +******************************************************************************* +*/ + +#include "corev_uvmt.h" + +.section .debugger_exception, "ax" +.global _debugger_exception_start +.global glb_debug_status +.global glb_hart_status +.global glb_debug_exception_status +.global glb_expect_debug_exception +//.global _debugger_fail +//.global _debugger_end +.set test_fail, 0x1 + +_debugger_exception_start: + // First check to see if exception was expected + la a1, glb_expect_debug_exception + lw t1, 0(a1) + //beq x0,t1,_debugger_fail + beq x0,t1,_debugger_exception_fail + + // Set exception status to hart status + la a1, glb_hart_status + lw t1, 0(a1) + la a2, glb_debug_exception_status + sw t1, 0(a2) + + //j _debugger_end + j _debugger_exception_end + +// Local copy of the exit path implemented in debugger.S; that symbol could not be referenced from this file + // NOTE(review): unlike _debugger_end_continue in debugger.S, a0 is not reloaded with __debugger_stack_start and the mcycle/minstret checks are skipped - confirm a0 still points at the debugger stack on entry +_debugger_exception_end: + // Clear debug entry expectation flag + la a1, glb_expect_debug_entry + sw x0, 0(a1) + la a1, glb_expect_debug_exception + sw x0, 0(a1) + // Debugger Un-Stack + //lw t0, 0(a0) + csrr t0, 0x7b3 // t0 was parked in dscratch1 by _debugger_start + lw t1, 4(a0) + lw t2, 8(a0) + lw a1, 12(a0) + lw a2, 16(a0) + csrr a0, dscratch // a0 was parked in dscratch0 by _debugger_start + dret +// Same behaviour as _debugger_fail in debugger.S: write test_fail to the status flags register +_debugger_exception_fail: + li a0, CV_VP_STATUS_FLAGS_BASE + li t0, test_fail + sw t0, 0(a0) + nop + nop + nop + nop + diff --git a/verif/tests/custom/debug_test/handlers.S b/verif/tests/custom/debug_test/handlers.S new file mode 100644 index 0000000000..0bdee311f8 --- /dev/null +++ b/verif/tests/custom/debug_test/handlers.S @@ -0,0 +1,349 @@ +/* +* Copyright 2019 ETH Zürich and University of Bologna +* +* Licensed under the Apache License, Version 2.0 (the "License"); +* you may not use this file except in compliance with the License. 
+* You may obtain a copy of the License at +* +* http://www.apache.org/licenses/LICENSE-2.0 +* +* Unless required by applicable law or agreed to in writing, software +* distributed under the License is distributed on an "AS IS" BASIS, +* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +* See the License for the specific language governing permissions and +* limitations under the License. +*/ + +#include "corev_uvmt.h" + +/* Exception codes */ +#define EXCEPTION_ILLEGAL_INSN 2 +#define EXCEPTION_BREAKPOINT 3 +#define EXCEPTION_ECALL_M 11 + +.section .text.handlers +.global __no_irq_handler +.global u_sw_irq_handler +.global m_software_irq_handler +.global m_timer_irq_handler +.global m_external_irq_handler +.global m_fast0_irq_handler +.global m_fast1_irq_handler +.global m_fast2_irq_handler +.global m_fast3_irq_handler +.global m_fast4_irq_handler +.global m_fast5_irq_handler +.global m_fast6_irq_handler +.global m_fast7_irq_handler +.global m_fast8_irq_handler +.global m_fast9_irq_handler +.global m_fast10_irq_handler +.global m_fast11_irq_handler +.global m_fast12_irq_handler +.global m_fast13_irq_handler +.global m_fast14_irq_handler +.global m_fast15_irq_handler + +.weak m_software_irq_handler +.weak m_timer_irq_handler +.weak m_external_irq_handler +.weak m_fast0_irq_handler +.weak m_fast1_irq_handler +.weak m_fast2_irq_handler +.weak m_fast3_irq_handler +.weak m_fast4_irq_handler +.weak m_fast5_irq_handler +.weak m_fast6_irq_handler +.weak m_fast7_irq_handler +.weak m_fast8_irq_handler +.weak m_fast9_irq_handler +.weak m_fast10_irq_handler +.weak m_fast11_irq_handler +.weak m_fast12_irq_handler +.weak m_fast13_irq_handler +.weak m_fast14_irq_handler +.weak m_fast15_irq_handler + +.global glb_illegal_insn_status +.global glb_ebreak_status +.global glb_expect_illegal_insn +.global glb_expect_ebreak_handler +.global glb_exception_ebreak_status +.global glb_expect_irq_entry +.set test_ret_val, CV_VP_STATUS_FLAGS_BASE +.set test_fail, 0x1 
+ +/* exception handling */ +__no_irq_handler: + addi sp,sp,-64 + sw ra, 0(sp) + sw a0, 4(sp) + sw a1, 8(sp) + sw a2, 12(sp) + sw a3, 16(sp) + sw a4, 20(sp) + sw a5, 24(sp) + sw a6, 28(sp) + sw a7, 32(sp) + sw t0, 36(sp) + sw t1, 40(sp) + sw t2, 44(sp) + sw t3, 48(sp) + sw t4, 52(sp) + sw t5, 56(sp) + sw t6, 60(sp) + + la a0, no_exception_handler_msg + jal ra, puts + + // Check if we expected to enter irq + la a1, glb_expect_irq_entry + lw t0, 0(a1) + beq t0, x0, _irq_fail + + // Clear entry flag + li t0, 0 + sw t0, 0(a1) + //j __no_irq_handler + + // Return + lw ra, 0(sp) + lw a0, 4(sp) + lw a1, 8(sp) + lw a2, 12(sp) + lw a3, 16(sp) + lw a4, 20(sp) + lw a5, 24(sp) + lw a6, 28(sp) + lw a7, 32(sp) + lw t0, 36(sp) + lw t1, 40(sp) + lw t2, 44(sp) + lw t3, 48(sp) + lw t4, 52(sp) + lw t5, 56(sp) + lw t6, 60(sp) + addi sp,sp,64 + mret + +_irq_fail: + li a0, CV_VP_STATUS_FLAGS_BASE + li t0, test_fail + sw t0, 0(a0) + ret + +u_sw_irq_handler: + /* While we are still using puts in handlers, save all caller saved + regs. Eventually, some of these saves could be deferred. 
*/ + addi sp,sp,-64 + sw ra, 0(sp) + sw a0, 4(sp) + sw a1, 8(sp) + sw a2, 12(sp) + sw a3, 16(sp) + sw a4, 20(sp) + sw a5, 24(sp) + sw a6, 28(sp) + sw a7, 32(sp) + sw t0, 36(sp) + sw t1, 40(sp) + sw t2, 44(sp) + sw t3, 48(sp) + sw t4, 52(sp) + sw t5, 56(sp) + sw t6, 60(sp) + csrr t0, mcause + li t1, EXCEPTION_ILLEGAL_INSN + beq t0, t1, handle_illegal_insn + li t1, EXCEPTION_ECALL_M + beq t0, t1, handle_ecall + li t1, EXCEPTION_BREAKPOINT + beq t0, t1, handle_ebreak + j handle_unknown + +handle_ecall: + la a0, ecall_msg + jal ra, handle_syscall + j end_handler_incr_mepc + +m_software_irq_handler: + j __no_irq_handler + +m_timer_irq_handler: + j __no_irq_handler + +m_external_irq_handler: + j __no_irq_handler + +m_fast0_irq_handler: + j __no_irq_handler + +m_fast1_irq_handler: + j __no_irq_handler + +m_fast2_irq_handler: + j __no_irq_handler + +m_fast3_irq_handler: + j __no_irq_handler + +m_fast4_irq_handler: + j __no_irq_handler + +m_fast5_irq_handler: + j __no_irq_handler + +m_fast6_irq_handler: + j __no_irq_handler + +m_fast7_irq_handler: + j __no_irq_handler + +m_fast8_irq_handler: + j __no_irq_handler + +m_fast9_irq_handler: + j __no_irq_handler + +m_fast10_irq_handler: + j __no_irq_handler + +m_fast11_irq_handler: + j __no_irq_handler + +m_fast12_irq_handler: + j __no_irq_handler + +m_fast13_irq_handler: + j __no_irq_handler + +m_fast14_irq_handler: + j __no_irq_handler + +m_fast15_irq_handler: + j __no_irq_handler + + +handle_ebreak: + /* TODO support debug handling requirements. 
*/ + la a0, ebreak_msg + jal ra, puts + // Check if expecting ebreak handler + la a0, glb_expect_ebreak_handler + lw t0, 0(a0) + bne t0, x0, cont_handle_ebreak + // Not expecting ebreak, assert test failed + li a0, CV_VP_STATUS_FLAGS_BASE + li t0, 1 + sw t0, 0(a0) + j end_handler_incr_mepc +cont_handle_ebreak: + //increment hart status + sw x0, 0(a0) + la a0, glb_ebreak_status + lw t0, 0(a0) + addi t0,t0,1 + sw t0, 0(a0) + j end_handler_incr_mepc + + + +handle_illegal_insn: + la a0, illegal_insn_msg + jal ra, puts + // Check if expecting illegal instruction + la a0, glb_expect_illegal_insn + lw t0, 0(a0) + bne t0, x0, cont_illegal_insn + li a0, CV_VP_STATUS_FLAGS_BASE + li t0, 1 + sw t0, 0(a0) //Test Failed + j end_handler_incr_mepc +cont_illegal_insn: + //increment hart status + sw x0, 0(a0) + la a0, glb_illegal_insn_status + lw t0, 0(a0) + addi t0,t0,1 + sw t0, 0(a0) + + // Check if we are expected to execute ebreak + la a0, glb_exception_ebreak_status + lw t0, 0(a0) + // End handler if no ebreak is to be executed + beq t0, x0, end_handler_incr_mepc + + // Clear ebreak flag + sw x0, 0(a0) + // Execute ebreak + .4byte 0x00100073 + // Exit handler + j end_handler_incr_mepc + + j end_handler_incr_mepc + + + + + + + +handle_unknown: + la a0, unknown_msg + jal ra, puts + /* We don't know what interrupt/exception is being handled, so don't + increment mepc. */ + j end_handler_ret + + + + + + +end_handler_incr_mepc: + csrr t0, mepc + lb t1, 0(t0) + li a0, 0x3 + and t1, t1, a0 + /* Increment mepc by 2 or 4 depending on whether the instruction at mepc + is compressed or not. 
*/ + bne t1, a0, end_handler_incr_mepc2 + addi t0, t0, 2 +end_handler_incr_mepc2: + addi t0, t0, 2 + csrw mepc, t0 +end_handler_ret: + lw ra, 0(sp) + lw a0, 4(sp) + lw a1, 8(sp) + lw a2, 12(sp) + lw a3, 16(sp) + lw a4, 20(sp) + lw a5, 24(sp) + lw a6, 28(sp) + lw a7, 32(sp) + lw t0, 36(sp) + lw t1, 40(sp) + lw t2, 44(sp) + lw t3, 48(sp) + lw t4, 52(sp) + lw t5, 56(sp) + lw t6, 60(sp) + addi sp,sp,64 + mret +/* this interrupt can be generated for verification purposes, random or when the + PC is equal to a given value*/ +verification_irq_handler: + mret + +.section .rodata +illegal_insn_msg: + .string "illegal instruction exception handler entered\n" +ecall_msg: + .string "ecall exception handler entered\n" +ebreak_msg: + .string "ebreak exception handler entered\n" +unknown_msg: + .string "unknown exception handler entered\n" +no_exception_handler_msg: + .string "no exception handler installed\n" diff --git a/verif/tests/custom/debug_test/single_step.S b/verif/tests/custom/debug_test/single_step.S new file mode 100644 index 0000000000..4337b197fe --- /dev/null +++ b/verif/tests/custom/debug_test/single_step.S @@ -0,0 +1,233 @@ +#Copyright 202[x] Silicon Labs, Inc. + +#This file, and derivatives thereof are licensed under the +#Solderpad License, Version 2.0 (the "License"); +#Use of this file means you agree to the terms and conditions +#of the license and are in full compliance with the License. +#You may obtain a copy of the License at +# +# https://solderpad.org/licenses/SHL-2.0/ +# +#Unless required by applicable law or agreed to in writing, software +#and hardware implementations thereof +#distributed under the License is distributed on an "AS IS" BASIS, +#WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, EITHER EXPRESSED OR IMPLIED. +#See the License for the specific language governing permissions and +#limitations under the License. 
+ +#include "corev_uvmt.h" + +.section .single_step_code_sect, "ax" +.set timer_reg_addr, CV_VP_INTR_TIMER_BASE+0 +.set timer_val_addr, CV_VP_INTR_TIMER_BASE+4 +.set test_ret_val, CV_VP_STATUS_FLAGS_BASE +.set test_fail, 0x1 + +.global glb_step_info +.global glb_expect_debug_entry +.global glb_expect_illegal_insn +.global glb_expect_irq_entry +.global _step_trig_point +.global _step_trig_exit +.global _single_step + + +_single_step: // Single-step scenario driver: writes a step reason to glb_step_info before each stimulus; saves/restores t0,t1,a0,a1,a2,ra + addi sp,sp,-32 // 32-byte frame (24 bytes used): a multiple of 16 preserves the caller's sp alignment, so the sw/lw below stay word-aligned + sw t0, 0(sp) + sw t1, 4(sp) + sw a0, 8(sp) + sw a1, 12(sp) + sw a2, 16(sp) + sw ra, 20(sp) + + // Expect debug + la a1, glb_expect_debug_entry + li t0, 1 + sw t0, 0(a1) + + // Set step cause to 1 - enable single stepping + la a1, glb_step_info + li t0, 1 + sw t0, 0(a1) + + // Set t0 to 0 + li t0, 0 + + // Enter debug mode to execute cause=1 + c.ebreak + + // To check if debug code increments DPC correctly, + // Load up t0 in first instruction after ebreak + li t0, 1 + beq t0, x0, _single_step_fail + + // We are single stepping, WFI should complete as NOP + // Test will hang here if WFI is not converted properly + wfi + + // illegal instruction + la a1, glb_expect_illegal_insn + li t0, 1 + sw t0, 0(a1) + + la a1, glb_step_info + li t0, 3 + sw t0, 0(a1) + + csrr t0, dcsr // illegal + + la a1, glb_expect_illegal_insn + li t0, 1 + sw t0, 0(a1) + dret // illegal + + // Trigger match setup + la a1, glb_step_info + li t0, 4 + sw t0, 0(a1) + nop + nop + li t0, 0 + +_step_trig_point: + li t0, 1 // trig here + +_step_trig_exit: + addi t0, t0,2 // debug code moves dpc to here + li t1, 2 + // If trigger was correct, debug code skips + // loading of t0 to 1, and t0 should be of value 2 + bne t0, t1, _single_step_fail + + + //----------------- + // Stepping with interrupt, stepie=1 + la a1, glb_step_info + li t0, 5 + sw t0, 0(a1) + + // Expect irq flag + la a1, glb_expect_irq_entry + li t0, 1 + sw t0, 0(a1) + + // Assert irq + li a1, timer_reg_addr + li t0, 0x40000000 + sw t0, 0(a1) + li a1, timer_val_addr + li t0, 2 + sw t0, 0(a1) 
+ +_irq_wait_loop: + la a1, glb_expect_irq_entry + lw t0, 0(a1); + bne t0, x0, _irq_wait_loop + + + //----------------- + // Stepping with interrupt, stepie=0 + la a1, glb_step_info + li t0, 6 + sw t0, 0(a1) + + // Assert irq + li a1, timer_reg_addr + li t0, 0x40000000 + sw t0, 0(a1) + li a1, timer_val_addr + li t0, 2 + sw t0, 0(a1) + + // Wait out some instructions to give IRQ a chance + // Report an ERROR if IRQ taken as we did not set glb_expect_irq_entry flag + nop + nop + nop + nop + + // De-Assert irq + li a1, timer_reg_addr + li t0, 0x00000000 + sw t0, 0(a1) + li a1, timer_val_addr + li t0, 1 + sw t0, 0(a1) + + nop + nop + + # set step reason to 7 (step with c.ebreak) + la a1, glb_step_info + li t0, 7 + sw t0, 0(a1) + + # Ebreak to cover ebreak vs step cause priority + c.ebreak + + # set step reason to 8 (step with ebreak) + la a1, glb_step_info + li t0, 8 + sw t0, 0(a1) + + # Ebreak to cover ebreak vs step cause priority + .4byte 0x00100073 + + # Set step reason to 9, ebreak without dcsr.ebreakm + la a1, glb_step_info + li t0, 9 + sw t0, 0(a1) + + # Expect to enter ebreak handler + la a0, glb_expect_ebreak_handler + li t0, 1 + sw t0, 0(a0) + + .4byte 0x00100073 + + # Expect to enter ebreak handler + la a0, glb_expect_ebreak_handler + li t0, 1 + sw t0, 0(a0) + # Set step reason to 10, cebreak without dcsr.ebreakm + la a1, glb_step_info + li t0, 10 + sw t0, 0(a1) + + c.ebreak + + # set step reason to 0, normal step + la a1, glb_step_info + li t0, 0 + sw t0, 0(a1) + + ecall + // Cause 2, disable single stepping + la a1, glb_step_info + li t0, 2 + sw t0, 0(a1) + nop + nop + j _single_step_done + +_single_step_fail: + li a0, CV_VP_STATUS_FLAGS_BASE + li t0, test_fail + sw t0, 0(a0) + // Turn off single step + la a1, glb_step_info + li t0, 2 + sw t0, 0(a1) + + j _single_step_done + +_single_step_done: + lw t0, 0(sp) + lw t1, 4(sp) + lw a0, 8(sp) + lw a1, 12(sp) + lw a2, 16(sp) + lw ra, 20(sp) + addi sp,sp,32 // release the 32-byte frame reserved at _single_step + ret diff --git 
a/verif/tests/custom/debug_test/test.yaml b/verif/tests/custom/debug_test/test.yaml new file mode 100644 index 0000000000..3f975a0737 --- /dev/null +++ b/verif/tests/custom/debug_test/test.yaml @@ -0,0 +1,11 @@ +# Test definition YAML for test + +# Debug directed test +name: debug_test +uvm_test: uvmt_$(CV_CORE_LC)_firmware_test_c +program: debug_test +description: > + Debug directed test +# FIXME:The minstret compare issues with this test should be filed as bug and fixed +disable_csr_check: + - minstret diff --git a/verif/tests/custom/debug_test/trigger_code.S b/verif/tests/custom/debug_test/trigger_code.S new file mode 100644 index 0000000000..f66a54ca2c --- /dev/null +++ b/verif/tests/custom/debug_test/trigger_code.S @@ -0,0 +1,160 @@ +#Copyright 202[x] Silicon Labs, Inc. +# +#This file, and derivatives thereof are licensed under the +#Solderpad License, Version 2.0 (the "License"); +#Use of this file means you agree to the terms and conditions +#of the license and are in full compliance with the License. +#You may obtain a copy of the License at +# +# https://solderpad.org/licenses/SHL-2.0/ +# +#Unless required by applicable law or agreed to in writing, software +#and hardware implementations thereof +#distributed under the License is distributed on an "AS IS" BASIS, +#WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, EITHER EXPRESSED OR IMPLIED. +#See the License for the specific language governing permissions and +#limitations under the License. 
+ +#include "corev_uvmt.h" + +.section .trigger_code_sect, "ax" +.set test_ret_val, CV_VP_STATUS_FLAGS_BASE +.set test_fail, 0x1 + +.global _trigger_exit +.global _trigger_test +.global _trigger_code +.global _trigger_test_ebreak +.global _trigger_code_ebreak +.global _trigger_code_illegal_insn +.global _trigger_code_branch_insn +.global _trigger_code_multicycle_insn +.global _trigger_code_cebreak +.type _trigger_code, @function +.type _trigger_code_ebreak, @function +.type _trigger_code_cebreak, @function +.type _trigger_code_illegal_insn, @function +.type _trigger_code_branch_insn, @function +.type _trigger_code_multicycle_insn, @function + + +_trigger_code_ebreak: + .4byte 0x00100073 + ret + +_trigger_code_cebreak: + c.ebreak + ret +_trigger_code_illegal_insn: + dret + ret +_trigger_code_branch_insn: + beq t0, t1, __trigger_fail + ret +_trigger_code_multicycle_insn: + mulhsu t0, t0, t1 + ret +_trigger_test_ebreak: + addi sp,sp,-32 // 32-byte frame (24 bytes used): a multiple of 16 preserves the caller's sp alignment, so the sw/lw on it stay word-aligned + sw t0, 0(sp) + sw t1, 4(sp) + sw a0, 8(sp) + sw a1, 12(sp) + sw a2, 16(sp) + sw ra, 20(sp) + + # a0 holds argument + # 0 - ebreak + # 1 - c.ebreak + # 2 - illegal instruction + # 3 - branch instruction + # 4 - multicycle instruction (mulhsu) + + mv t1, a0 + li t0, 0 + beq t0, t1, _jmp_ebreak + + li t0, 1 + beq t0, t1, _jmp_cebreak + + li t0, 2 + beq t0, t1, _jmp_illegal_insn + + li t0, 3 + beq t0, t1, _jmp_branch_insn + + li t0, 4 + beq t0, t1, _jmp_multicycle_insn + +_jmp_ebreak: + jal ra, _trigger_code_ebreak + j __trigger_done +_jmp_cebreak: + jal ra, _trigger_code_cebreak + j __trigger_done +_jmp_illegal_insn: + jal ra, _trigger_code_illegal_insn + j __trigger_done +_jmp_branch_insn: + jal ra, _trigger_code_branch_insn + j __trigger_done +_jmp_multicycle_insn: + jal ra, _trigger_code_multicycle_insn + j __trigger_done + +# j __trigger_done + + + // We will trigger on the _trigger_code address + // We should not expect the first instruction to execute + // The debugger code will move the PC to the trigger_exit_code + // Which 
essentially avoid executing all of the code in the trigger_code +_trigger_code: + add a2,a0,a1 + ret +_trigger_exit: + ret +_trigger_test: + addi sp,sp,-32 // same 32-byte frame layout as _trigger_test_ebreak; both exit through __trigger_done + sw t0, 0(sp) + sw t1, 4(sp) + sw a0, 8(sp) + sw a1, 12(sp) + sw a2, 16(sp) + sw ra, 20(sp) + + // a0 holds input to function (expect trigger) + mv t1, a0 + + // Load up some random data to add + li a0, 7893 + li a1, 1452 + li a2, 191 // a2 value will be overwritten by _trigger_code + mv t2, a2 // keep a copy of the value to compare against + + // Call function that will have a trigger match + // If no trigger match, then a2=a0+a1 + // Else if trigger matched, then a2 is not modified + jal ra, _trigger_code + + // if (expect trigger) check against original value (in t2) + bne t1 ,x0, __trigger_check + // else + // trigger match not expected, function executes as normal + // set expected value to t2 = a0 + a1 + add t2, a0, a1 +__trigger_check: + beq t2,a2,__trigger_done +__trigger_fail: + li a0, CV_VP_STATUS_FLAGS_BASE + li t0, 1 + sw t0, 0(a0) +__trigger_done: + lw t0, 0(sp) + lw t1, 4(sp) + lw a0, 8(sp) + lw a1, 12(sp) + lw a2, 16(sp) + lw ra, 20(sp) + addi sp,sp,32 // release the 32-byte frame reserved by _trigger_test / _trigger_test_ebreak + ret diff --git a/verif/tests/custom/dhrystone/dhrystone.c b/verif/tests/custom/dhrystone/dhrystone.c new file mode 100644 index 0000000000..38e33764f2 --- /dev/null +++ b/verif/tests/custom/dhrystone/dhrystone.c @@ -0,0 +1,185 @@ +// See LICENSE for license details. + +#pragma GCC optimize ("no-inline") + +#include "dhrystone.h" + +#ifndef REG +#define REG + /* REG becomes defined as empty */ + /* i.e. no register variables */ +#else +#undef REG +#define REG register +#endif + +extern int Int_Glob; +extern char Ch_1_Glob; + + +Proc_6 (Enum_Val_Par, Enum_Ref_Par) +/*********************************/ + /* executed once */ + /* Enum_Val_Par == Ident_3, Enum_Ref_Par becomes Ident_2 */ + +Enumeration Enum_Val_Par; +Enumeration *Enum_Ref_Par; +{ + *Enum_Ref_Par = Enum_Val_Par; + if (! 
Func_3 (Enum_Val_Par)) + /* then, not executed */ + *Enum_Ref_Par = Ident_4; + switch (Enum_Val_Par) + { + case Ident_1: + *Enum_Ref_Par = Ident_1; + break; + case Ident_2: + if (Int_Glob > 100) + /* then */ + *Enum_Ref_Par = Ident_1; + else *Enum_Ref_Par = Ident_4; + break; + case Ident_3: /* executed */ + *Enum_Ref_Par = Ident_2; + break; + case Ident_4: break; + case Ident_5: + *Enum_Ref_Par = Ident_3; + break; + } /* switch */ +} /* Proc_6 */ + + +Proc_7 (Int_1_Par_Val, Int_2_Par_Val, Int_Par_Ref) +/**********************************************/ + /* executed three times */ + /* first call: Int_1_Par_Val == 2, Int_2_Par_Val == 3, */ + /* Int_Par_Ref becomes 7 */ + /* second call: Int_1_Par_Val == 10, Int_2_Par_Val == 5, */ + /* Int_Par_Ref becomes 17 */ + /* third call: Int_1_Par_Val == 6, Int_2_Par_Val == 10, */ + /* Int_Par_Ref becomes 18 */ +One_Fifty Int_1_Par_Val; +One_Fifty Int_2_Par_Val; +One_Fifty *Int_Par_Ref; +{ + One_Fifty Int_Loc; + + Int_Loc = Int_1_Par_Val + 2; + *Int_Par_Ref = Int_2_Par_Val + Int_Loc; +} /* Proc_7 */ + + +Proc_8 (Arr_1_Par_Ref, Arr_2_Par_Ref, Int_1_Par_Val, Int_2_Par_Val) +/*********************************************************************/ + /* executed once */ + /* Int_Par_Val_1 == 3 */ + /* Int_Par_Val_2 == 7 */ +Arr_1_Dim Arr_1_Par_Ref; +Arr_2_Dim Arr_2_Par_Ref; +int Int_1_Par_Val; +int Int_2_Par_Val; +{ + REG One_Fifty Int_Index; + REG One_Fifty Int_Loc; + + Int_Loc = Int_1_Par_Val + 5; + Arr_1_Par_Ref [Int_Loc] = Int_2_Par_Val; + Arr_1_Par_Ref [Int_Loc+1] = Arr_1_Par_Ref [Int_Loc]; + Arr_1_Par_Ref [Int_Loc+30] = Int_Loc; + for (Int_Index = Int_Loc; Int_Index <= Int_Loc+1; ++Int_Index) + Arr_2_Par_Ref [Int_Loc] [Int_Index] = Int_Loc; + Arr_2_Par_Ref [Int_Loc] [Int_Loc-1] += 1; + Arr_2_Par_Ref [Int_Loc+20] [Int_Loc] = Arr_1_Par_Ref [Int_Loc]; + Int_Glob = 5; +} /* Proc_8 */ + + +Enumeration Func_1 (Ch_1_Par_Val, Ch_2_Par_Val) +/*************************************************/ + /* executed three times */ + /* first 
call: Ch_1_Par_Val == 'H', Ch_2_Par_Val == 'R' */ + /* second call: Ch_1_Par_Val == 'A', Ch_2_Par_Val == 'C' */ + /* third call: Ch_1_Par_Val == 'B', Ch_2_Par_Val == 'C' */ + +Capital_Letter Ch_1_Par_Val; +Capital_Letter Ch_2_Par_Val; +{ + Capital_Letter Ch_1_Loc; + Capital_Letter Ch_2_Loc; + + Ch_1_Loc = Ch_1_Par_Val; + Ch_2_Loc = Ch_1_Loc; + if (Ch_2_Loc != Ch_2_Par_Val) + /* then, executed */ + return (Ident_1); + else /* not executed */ + { + Ch_1_Glob = Ch_1_Loc; + return (Ident_2); + } +} /* Func_1 */ + + +Boolean Func_2 (Str_1_Par_Ref, Str_2_Par_Ref) +/*************************************************/ + /* executed once */ + /* Str_1_Par_Ref == "DHRYSTONE PROGRAM, 1'ST STRING" */ + /* Str_2_Par_Ref == "DHRYSTONE PROGRAM, 2'ND STRING" */ + +Str_30 Str_1_Par_Ref; +Str_30 Str_2_Par_Ref; +{ + REG One_Thirty Int_Loc; + Capital_Letter Ch_Loc; + + Int_Loc = 2; + while (Int_Loc <= 2) /* loop body executed once */ + if (Func_1 (Str_1_Par_Ref[Int_Loc], + Str_2_Par_Ref[Int_Loc+1]) == Ident_1) + /* then, executed */ + { + Ch_Loc = 'A'; + Int_Loc += 1; + } /* if, while */ + if (Ch_Loc >= 'W' && Ch_Loc < 'Z') + /* then, not executed */ + Int_Loc = 7; + if (Ch_Loc == 'R') + /* then, not executed */ + return (true); + else /* executed */ + { + if (strcmp (Str_1_Par_Ref, Str_2_Par_Ref) > 0) + /* then, not executed */ + { + Int_Loc += 7; + Int_Glob = Int_Loc; + return (true); + } + else /* executed */ + return (false); + } /* if Ch_Loc */ +} /* Func_2 */ + + +Boolean Func_3 (Enum_Par_Val) +/***************************/ + /* executed once */ + /* Enum_Par_Val == Ident_3 */ +Enumeration Enum_Par_Val; +{ + Enumeration Enum_Loc; + + Enum_Loc = Enum_Par_Val; + if (Enum_Loc == Ident_3) + /* then, executed */ + return (true); + else /* not executed */ + return (false); +} /* Func_3 */ + +void debug_printf(const char* str, ...) 
+{ +} diff --git a/verif/tests/custom/dhrystone/dhrystone.h b/verif/tests/custom/dhrystone/dhrystone.h new file mode 100644 index 0000000000..9c85849f00 --- /dev/null +++ b/verif/tests/custom/dhrystone/dhrystone.h @@ -0,0 +1,477 @@ +// See LICENSE for license details. + +#ifndef _DHRYSTONE_H +#define _DHRYSTONE_H + +/****************** "DHRYSTONE" Benchmark Program ***************************/ +#define Version "C, Version 2.2" +/* File: dhry_1.c (part 2 of 3) + * Author: Reinhold P. Weicker + * Siemens Nixdorf, Paderborn/Germany + * weicker@specbench.org + * Date: May 25, 1988 + * Modified: Steven Pemberton, CWI, Amsterdam; Steven.Pemberton@cwi.nl + * Date: October, 1993; March 1995 + * Included both files into one source, that gets compiled + * in two passes. Made program auto-compiling, and auto-running, + * and generally made it much easier to use. + * + * Original Version (in Ada) published in + * "Communications of the ACM" vol. 27., no. 10 (Oct. 1984), + * pp. 1013 - 1030, together with the statistics + * on which the distribution of statements etc. is based. + * + * In this C version, the following C library functions are used: + * - strcpy, strcmp (inside the measurement loop) + * - printf, scanf (outside the measurement loop) + * In addition, Berkeley UNIX system calls "times ()" or "time ()" + * are used for execution time measurement. For measurements + * on other systems, these calls have to be changed. + * + * Collection of Results: + * Reinhold Weicker (address see above) and + * + * Rick Richardson + * PC Research. Inc. + * 94 Apple Orchard Drive + * Tinton Falls, NJ 07724 + * Phone: (201) 389-8963 (9-17 EST) + * Usenet: ...!uunet!pcrat!rick + * + * Please send results to Rick Richardson and/or Reinhold Weicker. + * Complete information should be given on hardware and software used. + * Hardware information includes: Machine type, CPU, type and size + * of caches; for microprocessors: clock frequency, memory speed + * (number of wait states). 
+ * Software information includes: Compiler (and runtime library) + * manufacturer and version, compilation switches, OS version. + * The Operating System version may give an indication about the compiler; + * Dhrystone itself performs no OS calls in the measurement loop. + * + * The complete output generated by the program should be mailed + * such that at least some checks for correctness can be made. + * + *************************************************************************** + * + * Defines: The following "Defines" are possible: + * -DREG (default: Not defined) + * As an approximation to what an average C programmer + * might do, causes the "register" storage class to be applied + * - for local variables, if they are used (dynamically) + * five or more times + * - for parameters if they are used (dynamically) + * six or more times + * Note that an optimal "register" strategy is + * compiler-dependent, and that "register" declarations + * do not necessarily lead to faster execution. + * -DNOSTRUCTASSIGN (default: Not defined) + * Define if the C compiler does not support + * assignment of structures. + * -DNOENUMS (default: Not defined) + * Define if the C compiler does not support + * enumeration types. + * -DTIMES (default) + * -DTIME + * The "times" function of UNIX (returning process times) + * or the "time" function (returning wallclock time) + * is used for measurement. + * For single user machines, "time ()" is adequate. For + * multi-user machines where you cannot get single-user + * access, use the "times ()" function. If you have + * neither, use a stopwatch in the dead of night. + * "printf"s are provided marking the points "Start Timer" + * and "Stop Timer". DO NOT use the UNIX "time(1)" + * command, as this will measure the total time to + * run this program, which will (erroneously) include + * the time to allocate storage (malloc) and to perform + * the initialization. 
+ * -DHZ=nnn + * In Berkeley UNIX, the function "times" returns process + * time in 1/HZ seconds, with HZ = 60 for most systems. + * CHECK YOUR SYSTEM DESCRIPTION BEFORE YOU JUST APPLY + * A VALUE. + * + *************************************************************************** + * + * History: Version C/2.1 was made for two reasons: + * + * 1) There was an obvious need for a common C version of + * Dhrystone, since C is at present the most popular system + * programming language for the class of processors + * (microcomputers, minicomputers) where Dhrystone is used most. + * There should be, as far as possible, only one C version of + * Dhrystone such that results can be compared without + * restrictions. In the past, the C versions distributed + * by Rick Richardson (Version 1.1) and by Reinhold Weicker + * had small (though not significant) differences. + * + * 2) As far as it is possible without changes to the Dhrystone + * statistics, optimizing compilers should be prevented from + * removing significant statements. + * + * This C version has been developed in cooperation with + * Rick Richardson (Tinton Falls, NJ), it incorporates many + * ideas from the "Version 1.1" distributed previously by + * him over the UNIX network Usenet. + * I also thank Chaim Benedelac (National Semiconductor), + * David Ditzel (SUN), Earl Killian and John Mashey (MIPS), + * Alan Smith and Rafael Saavedra-Barrera (UC at Berkeley) + * for their help with comments on earlier versions of the + * benchmark. + * + * Changes: In the initialization part, this version follows mostly + * Rick Richardson's version distributed via Usenet, not the + * version distributed earlier via floppy disk by Reinhold Weicker. + * As a concession to older compilers, names have been made + * unique within the first 8 characters. + * Inside the measurement loop, this version follows the + * version previously distributed by Reinhold Weicker. 
+ * + * At several places in the benchmark, code has been added, + * but within the measurement loop only in branches that + * are not executed. The intention is that optimizing compilers + * should be prevented from moving code out of the measurement + * loop, or from removing code altogether. Since the statements + * that are executed within the measurement loop have NOT been + * changed, the numbers defining the "Dhrystone distribution" + * (distribution of statements, operand types and locality) + * still hold. Except for sophisticated optimizing compilers, + * execution times for this version should be the same as + * for previous versions. + * + * Since it has proven difficult to subtract the time for the + * measurement loop overhead in a correct way, the loop check + * has been made a part of the benchmark. This does have + * an impact - though a very minor one - on the distribution + * statistics which have been updated for this version. + * + * All changes within the measurement loop are described + * and discussed in the companion paper "Rationale for + * Dhrystone version 2". + * + * Because of the self-imposed limitation that the order and + * distribution of the executed statements should not be + * changed, there are still cases where optimizing compilers + * may not generate code for some statements. To a certain + * degree, this is unavoidable for small synthetic benchmarks. + * Users of the benchmark are advised to check code listings + * whether code is generated for all statements of Dhrystone. + * + * Version 2.1 is identical to version 2.0 distributed via + * the UNIX network Usenet in March 1988 except that it corrects + * some minor deficiencies that were found by users of version 2.0. + * The only change within the measurement loop is that a + * non-executed "else" part was added to the "if" statement in + * Func_3, and a non-executed "else" part removed from Proc_3. 
+ * + * Version C/2.2, Steven Pemberton, October 1993 + * Functionally, identical to version 2.2; the changes are in + * how you compile and use it: + * - Everything is in one file now, but compiled in 2 passes + * - Compile (and run) by running the file through the shell: 'sh dhry.c" + * - Uses the system definition of HZ if one can be found + * - HZ must be defined, otherwise it won't compile (no defaults here) + * - The (uninteresting) output is printed to stderr (dhry2 > /dev/null) + * - The number of loops is passed as a parameter, rather than read + * (dhry2 500000) + * - If the number of loops is insufficient to get a good result, + * it repeats it with loops*10 until it is enough (rather than just + * stopping) + * - Output says which sort of clock it is using, and the HZ value + * - You can use -DREG instead of the -DREG=register of previous versions + * - Some stylistic cleanups. + * + *************************************************************************** + * + * Compilation model and measurement (IMPORTANT): + * + * The following "ground rules" apply for measurements: + * - Separate compilation + * - No procedure merging + * - Otherwise, compiler optimizations are allowed but should be indicated + * - Default results are those without register declarations + * See the companion paper "Rationale for Dhrystone Version 2" for a more + * detailed discussion of these ground rules. + * + * For 16-Bit processors (e.g. 80186, 80286), times for all compilation + * models ("small", "medium", "large" etc.) should be given if possible, + * together with a definition of these models for the compiler system used. + * + ************************************************************************** + * + * Dhrystone (C version) statistics: + * + * [Comment from the first distribution, updated for version 2. + * Note that because of language differences, the numbers are slightly + * different from the Ada version.] 
+ * + * The following program contains statements of a high level programming + * language (here: C) in a distribution considered representative: + * + * assignments 52 (51.0 %) + * control statements 33 (32.4 %) + * procedure, function calls 17 (16.7 %) + * + * 103 statements are dynamically executed. The program is balanced with + * respect to the three aspects: + * + * - statement type + * - operand type + * - operand locality + * operand global, local, parameter, or constant. + * + * The combination of these three aspects is balanced only approximately. + * + * 1. Statement Type: + * ----------------- number + * + * V1 = V2 9 + * (incl. V1 = F(..) + * V = Constant 12 + * Assignment, 7 + * with array element + * Assignment, 6 + * with record component + * -- + * 34 34 + * + * X = Y +|-|"&&"|"|" Z 5 + * X = Y +|-|"==" Constant 6 + * X = X +|- 1 3 + * X = Y *|/ Z 2 + * X = Expression, 1 + * two operators + * X = Expression, 1 + * three operators + * -- + * 18 18 + * + * if .... 14 + * with "else" 7 + * without "else" 7 + * executed 3 + * not executed 4 + * for ... 7 | counted every time + * while ... 4 | the loop condition + * do ... while 1 | is evaluated + * switch ... 1 + * break 1 + * declaration with 1 + * initialization + * -- + * 34 34 + * + * P (...) procedure call 11 + * user procedure 10 + * library procedure 1 + * X = F (...) + * function call 6 + * user function 5 + * library function 1 + * -- + * 17 17 + * --- + * 103 + * + * The average number of parameters in procedure or function calls + * is 1.82 (not counting the function values aX * + * + * 2. Operators + * ------------ + * number approximate + * percentage + * + * Arithmetic 32 50.8 + * + * + 21 33.3 + * - 7 11.1 + * * 3 4.8 + * / (int div) 1 1.6 + * + * Comparison 27 42.8 + * + * == 9 14.3 + * /= 4 6.3 + * > 1 1.6 + * < 3 4.8 + * >= 1 1.6 + * <= 9 14.3 + * + * Logic 4 6.3 + * + * && (AND-THEN) 1 1.6 + * | (OR) 1 1.6 + * ! (NOT) 2 3.2 + * + * -- ----- + * 63 100.1 + * + * + * 3. 
Operand Type (counted once per operand reference): + * --------------- + * number approximate + * percentage + * + * Integer 175 72.3 % + * Character 45 18.6 % + * Pointer 12 5.0 % + * String30 6 2.5 % + * Array 2 0.8 % + * Record 2 0.8 % + * --- ------- + * 242 100.0 % + * + * When there is an access path leading to the final operand (e.g. a record + * component), only the final data type on the access path is counted. + * + * + * 4. Operand Locality: + * ------------------- + * number approximate + * percentage + * + * local variable 114 47.1 % + * global variable 22 9.1 % + * parameter 45 18.6 % + * value 23 9.5 % + * reference 22 9.1 % + * function result 6 2.5 % + * constant 55 22.7 % + * --- ------- + * 242 100.0 % + * + * The program does not compute anything meaningful, but it is syntactically + * and semantically correct. All variables have a value assigned to them + * before they are used as a source operand. + * + * There has been no explicit effort to account for the effects of a + * cache, or to balance the use of long or short displacements for code or + * data. 
+ * + *************************************************************************** + */ + +/* Compiler and system dependent definitions: */ + +/* variables for time measurement: */ + +#ifdef TIME + +#define CLOCK_TYPE "time()" +#undef HZ +#define HZ (1) /* time() returns time in seconds */ +extern long time(); /* see library function "time" */ +#define Too_Small_Time 2 /* Measurements should last at least 2 seconds */ +#define Start_Timer() Begin_Time = time ( (long *) 0) +#define Stop_Timer() End_Time = time ( (long *) 0) + +#else + +#ifdef MSC_CLOCK /* Use Microsoft C hi-res clock */ + +#undef HZ +#undef TIMES +#include +#define HZ CLK_TCK +#define CLOCK_TYPE "MSC clock()" +extern clock_t clock(); +#define Too_Small_Time (2*HZ) +#define Start_Timer() Begin_Time = clock() +#define Stop_Timer() End_Time = clock() + +#elif defined(__riscv) + +#define HZ 1000000 +#define Too_Small_Time 1 +#define CLOCK_TYPE "rdcycle()" +#define Start_Timer() Begin_Time = read_csr(mcycle) +#define Stop_Timer() End_Time = read_csr(mcycle) + +#else + /* Use times(2) time function unless */ + /* explicitly defined otherwise */ +#define CLOCK_TYPE "times()" +#include +#include +#ifndef HZ /* Added by SP 900619 */ +#include /* If your system doesn't have this, use -DHZ=xxx */ +#else + *** You must define HZ!!! 
*** +#endif /* HZ */ +#ifndef PASS2 +struct tms time_info; +#endif +/*extern int times ();*/ + /* see library function "times" */ +#define Too_Small_Time (2*HZ) + /* Measurements should last at least about 2 seconds */ +#define Start_Timer() times(&time_info); Begin_Time=(long)time_info.tms_utime +#define Stop_Timer() times(&time_info); End_Time = (long)time_info.tms_utime + +#endif /* MSC_CLOCK */ +#endif /* TIME */ + + +#define Mic_secs_Per_Second 1000000 +#define NUMBER_OF_RUNS 50 /* Default number of runs */ + +#ifdef NOSTRUCTASSIGN +#define structassign(d, s) memcpy(&(d), &(s), sizeof(d)) +#else +#define structassign(d, s) d = s +#endif + +#ifdef NOENUM +#define Ident_1 0 +#define Ident_2 1 +#define Ident_3 2 +#define Ident_4 3 +#define Ident_5 4 + typedef int Enumeration; +#else + typedef enum {Ident_1, Ident_2, Ident_3, Ident_4, Ident_5} + Enumeration; +#endif + /* for boolean and enumeration types in Ada, Pascal */ + +/* General definitions: */ + +#include +#include + /* for strcpy, strcmp */ + +#define Null 0 + /* Value of a Null pointer */ +#define true 1 +#define false 0 + +typedef int One_Thirty; +typedef int One_Fifty; +typedef char Capital_Letter; +typedef int Boolean; +typedef char Str_30 [31]; +typedef int Arr_1_Dim [50]; +typedef int Arr_2_Dim [50] [50]; + +typedef struct record + { + struct record *Ptr_Comp; + Enumeration Discr; + union { + struct { + Enumeration Enum_Comp; + int Int_Comp; + char Str_Comp [31]; + } var_1; + struct { + Enumeration E_Comp_2; + char Str_2_Comp [31]; + } var_2; + struct { + char Ch_1_Comp; + char Ch_2_Comp; + } var_3; + } variant; + } Rec_Type, *Rec_Pointer; + +#endif diff --git a/verif/tests/custom/dhrystone/dhrystone_main.c b/verif/tests/custom/dhrystone/dhrystone_main.c new file mode 100644 index 0000000000..9c7bcf544f --- /dev/null +++ b/verif/tests/custom/dhrystone/dhrystone_main.c @@ -0,0 +1,332 @@ +// See LICENSE for license details. 
+ +//************************************************************************** +// Dhrystone bencmark +//-------------------------------------------------------------------------- +// +// This is the classic Dhrystone synthetic integer benchmark. +// + +#pragma GCC optimize ("no-inline") + +#include "dhrystone.h" + +void debug_printf(const char* str, ...); + +#include "util.h" + +#include + +/* Global Variables: */ + +Rec_Pointer Ptr_Glob, + Next_Ptr_Glob; +int Int_Glob; +Boolean Bool_Glob; +char Ch_1_Glob, + Ch_2_Glob; +int Arr_1_Glob [50]; +int Arr_2_Glob [50] [50]; + +Enumeration Func_1 (); + /* forward declaration necessary since Enumeration may not simply be int */ + +#ifndef REG + Boolean Reg = false; +#define REG + /* REG becomes defined as empty */ + /* i.e. no register variables */ +#else + Boolean Reg = true; +#undef REG +#define REG register +#endif + +Boolean Done; + +long Begin_Time, + End_Time, + User_Time; +long Microseconds, + Dhrystones_Per_Second; + +/* end of variables for time measurement */ + + +int main (int argc, char** argv) +/*****/ + /* main program, corresponds to procedures */ + /* Main and Proc_0 in the Ada version */ +{ + One_Fifty Int_1_Loc; + REG One_Fifty Int_2_Loc; + One_Fifty Int_3_Loc; + REG char Ch_Index; + Enumeration Enum_Loc; + Str_30 Str_1_Loc; + Str_30 Str_2_Loc; + REG int Run_Index; + REG int Number_Of_Runs; + + /* Arguments */ + Number_Of_Runs = NUMBER_OF_RUNS; + + /* Initializations */ + + Next_Ptr_Glob = (Rec_Pointer) alloca (sizeof (Rec_Type)); + Ptr_Glob = (Rec_Pointer) alloca (sizeof (Rec_Type)); + + Ptr_Glob->Ptr_Comp = Next_Ptr_Glob; + Ptr_Glob->Discr = Ident_1; + Ptr_Glob->variant.var_1.Enum_Comp = Ident_3; + Ptr_Glob->variant.var_1.Int_Comp = 40; + strcpy (Ptr_Glob->variant.var_1.Str_Comp, + "DHRYSTONE PROGRAM, SOME STRING"); + strcpy (Str_1_Loc, "DHRYSTONE PROGRAM, 1'ST STRING"); + + Arr_2_Glob [8][7] = 10; + /* Was missing in published program. 
Without this statement, */ + /* Arr_2_Glob [8][7] would have an undefined value. */ + /* Warning: With 16-Bit processors and Number_Of_Runs > 32000, */ + /* overflow may occur for this array element. */ + + debug_printf("\n"); + debug_printf("Dhrystone Benchmark, Version %s\n", Version); + if (Reg) + { + debug_printf("Program compiled with 'register' attribute\n"); + } + else + { + debug_printf("Program compiled without 'register' attribute\n"); + } + debug_printf("Using %s, HZ=%d\n", CLOCK_TYPE, HZ); + debug_printf("\n"); + + Done = false; + while (!Done) { + debug_printf("Trying %d runs through Dhrystone:\n", Number_Of_Runs); + + /***************/ + /* Start timer */ + /***************/ + + setStats(1); + Start_Timer(); + + for (Run_Index = 1; Run_Index <= Number_Of_Runs; ++Run_Index) + { + + Proc_5(); + Proc_4(); + /* Ch_1_Glob == 'A', Ch_2_Glob == 'B', Bool_Glob == true */ + Int_1_Loc = 2; + Int_2_Loc = 3; + strcpy (Str_2_Loc, "DHRYSTONE PROGRAM, 2'ND STRING"); + Enum_Loc = Ident_2; + Bool_Glob = ! 
Func_2 (Str_1_Loc, Str_2_Loc); + /* Bool_Glob == 1 */ + while (Int_1_Loc < Int_2_Loc) /* loop body executed once */ + { + Int_3_Loc = 5 * Int_1_Loc - Int_2_Loc; + /* Int_3_Loc == 7 */ + Proc_7 (Int_1_Loc, Int_2_Loc, &Int_3_Loc); + /* Int_3_Loc == 7 */ + Int_1_Loc += 1; + } /* while */ + /* Int_1_Loc == 3, Int_2_Loc == 3, Int_3_Loc == 7 */ + Proc_8 (Arr_1_Glob, Arr_2_Glob, Int_1_Loc, Int_3_Loc); + /* Int_Glob == 5 */ + Proc_1 (Ptr_Glob); + for (Ch_Index = 'A'; Ch_Index <= Ch_2_Glob; ++Ch_Index) + /* loop body executed twice */ + { + if (Enum_Loc == Func_1 (Ch_Index, 'C')) + /* then, not executed */ + { + Proc_6 (Ident_1, &Enum_Loc); + strcpy (Str_2_Loc, "DHRYSTONE PROGRAM, 3'RD STRING"); + Int_2_Loc = Run_Index; + Int_Glob = Run_Index; + } + } + /* Int_1_Loc == 3, Int_2_Loc == 3, Int_3_Loc == 7 */ + Int_2_Loc = Int_2_Loc * Int_1_Loc; + Int_1_Loc = Int_2_Loc / Int_3_Loc; + Int_2_Loc = 7 * (Int_2_Loc - Int_3_Loc) - Int_1_Loc; + /* Int_1_Loc == 1, Int_2_Loc == 13, Int_3_Loc == 7 */ + Proc_2 (&Int_1_Loc); + /* Int_1_Loc == 5 */ + + } /* loop "for Run_Index" */ + + /**************/ + /* Stop timer */ + /**************/ + + Stop_Timer(); + setStats(0); + + User_Time = End_Time - Begin_Time; + + if (User_Time < Too_Small_Time) + { + printf("Measured time too small to obtain meaningful results\n"); + Number_Of_Runs = Number_Of_Runs * 10; + printf("\n"); + } else Done = true; + } + + debug_printf("Final values of the variables used in the benchmark:\n"); + debug_printf("\n"); + debug_printf("Int_Glob: %d\n", Int_Glob); + debug_printf(" should be: %d\n", 5); + debug_printf("Bool_Glob: %d\n", Bool_Glob); + debug_printf(" should be: %d\n", 1); + debug_printf("Ch_1_Glob: %c\n", Ch_1_Glob); + debug_printf(" should be: %c\n", 'A'); + debug_printf("Ch_2_Glob: %c\n", Ch_2_Glob); + debug_printf(" should be: %c\n", 'B'); + debug_printf("Arr_1_Glob[8]: %d\n", Arr_1_Glob[8]); + debug_printf(" should be: %d\n", 7); + debug_printf("Arr_2_Glob[8][7]: %d\n", Arr_2_Glob[8][7]); + 
debug_printf(" should be: Number_Of_Runs + 10\n"); + debug_printf("Ptr_Glob->\n"); + debug_printf(" Ptr_Comp: %d\n", (long) Ptr_Glob->Ptr_Comp); + debug_printf(" should be: (implementation-dependent)\n"); + debug_printf(" Discr: %d\n", Ptr_Glob->Discr); + debug_printf(" should be: %d\n", 0); + debug_printf(" Enum_Comp: %d\n", Ptr_Glob->variant.var_1.Enum_Comp); + debug_printf(" should be: %d\n", 2); + debug_printf(" Int_Comp: %d\n", Ptr_Glob->variant.var_1.Int_Comp); + debug_printf(" should be: %d\n", 17); + debug_printf(" Str_Comp: %s\n", Ptr_Glob->variant.var_1.Str_Comp); + debug_printf(" should be: DHRYSTONE PROGRAM, SOME STRING\n"); + debug_printf("Next_Ptr_Glob->\n"); + debug_printf(" Ptr_Comp: %d\n", (long) Next_Ptr_Glob->Ptr_Comp); + debug_printf(" should be: (implementation-dependent), same as above\n"); + debug_printf(" Discr: %d\n", Next_Ptr_Glob->Discr); + debug_printf(" should be: %d\n", 0); + debug_printf(" Enum_Comp: %d\n", Next_Ptr_Glob->variant.var_1.Enum_Comp); + debug_printf(" should be: %d\n", 1); + debug_printf(" Int_Comp: %d\n", Next_Ptr_Glob->variant.var_1.Int_Comp); + debug_printf(" should be: %d\n", 18); + debug_printf(" Str_Comp: %s\n", + Next_Ptr_Glob->variant.var_1.Str_Comp); + debug_printf(" should be: DHRYSTONE PROGRAM, SOME STRING\n"); + debug_printf("Int_1_Loc: %d\n", Int_1_Loc); + debug_printf(" should be: %d\n", 5); + debug_printf("Int_2_Loc: %d\n", Int_2_Loc); + debug_printf(" should be: %d\n", 13); + debug_printf("Int_3_Loc: %d\n", Int_3_Loc); + debug_printf(" should be: %d\n", 7); + debug_printf("Enum_Loc: %d\n", Enum_Loc); + debug_printf(" should be: %d\n", 1); + debug_printf("Str_1_Loc: %s\n", Str_1_Loc); + debug_printf(" should be: DHRYSTONE PROGRAM, 1'ST STRING\n"); + debug_printf("Str_2_Loc: %s\n", Str_2_Loc); + debug_printf(" should be: DHRYSTONE PROGRAM, 2'ND STRING\n"); + debug_printf("\n"); + + + Microseconds = ((User_Time / Number_Of_Runs) * Mic_secs_Per_Second) / HZ; + Dhrystones_Per_Second = (HZ * Number_Of_Runs) / 
User_Time; + + printf("Microseconds for one run through Dhrystone: %ld\n", Microseconds); + printf("Dhrystones per Second: %ld\n", Dhrystones_Per_Second); + + return 0; +} + + +Proc_1 (Ptr_Val_Par) +/******************/ + +REG Rec_Pointer Ptr_Val_Par; + /* executed once */ +{ + REG Rec_Pointer Next_Record = Ptr_Val_Par->Ptr_Comp; + /* == Ptr_Glob_Next */ + /* Local variable, initialized with Ptr_Val_Par->Ptr_Comp, */ + /* corresponds to "rename" in Ada, "with" in Pascal */ + + structassign (*Ptr_Val_Par->Ptr_Comp, *Ptr_Glob); + Ptr_Val_Par->variant.var_1.Int_Comp = 5; + Next_Record->variant.var_1.Int_Comp + = Ptr_Val_Par->variant.var_1.Int_Comp; + Next_Record->Ptr_Comp = Ptr_Val_Par->Ptr_Comp; + Proc_3 (&Next_Record->Ptr_Comp); + /* Ptr_Val_Par->Ptr_Comp->Ptr_Comp + == Ptr_Glob->Ptr_Comp */ + if (Next_Record->Discr == Ident_1) + /* then, executed */ + { + Next_Record->variant.var_1.Int_Comp = 6; + Proc_6 (Ptr_Val_Par->variant.var_1.Enum_Comp, + &Next_Record->variant.var_1.Enum_Comp); + Next_Record->Ptr_Comp = Ptr_Glob->Ptr_Comp; + Proc_7 (Next_Record->variant.var_1.Int_Comp, 10, + &Next_Record->variant.var_1.Int_Comp); + } + else /* not executed */ + structassign (*Ptr_Val_Par, *Ptr_Val_Par->Ptr_Comp); +} /* Proc_1 */ + + +Proc_2 (Int_Par_Ref) +/******************/ + /* executed once */ + /* *Int_Par_Ref == 1, becomes 4 */ + +One_Fifty *Int_Par_Ref; +{ + One_Fifty Int_Loc; + Enumeration Enum_Loc; + + Int_Loc = *Int_Par_Ref + 10; + do /* executed once */ + if (Ch_1_Glob == 'A') + /* then, executed */ + { + Int_Loc -= 1; + *Int_Par_Ref = Int_Loc - Int_Glob; + Enum_Loc = Ident_1; + } /* if */ + while (Enum_Loc != Ident_1); /* true */ +} /* Proc_2 */ + + +Proc_3 (Ptr_Ref_Par) +/******************/ + /* executed once */ + /* Ptr_Ref_Par becomes Ptr_Glob */ + +Rec_Pointer *Ptr_Ref_Par; + +{ + if (Ptr_Glob != Null) + /* then, executed */ + *Ptr_Ref_Par = Ptr_Glob->Ptr_Comp; + Proc_7 (10, Int_Glob, &Ptr_Glob->variant.var_1.Int_Comp); +} /* Proc_3 */ + + +Proc_4 () /* 
without parameters */ +/*******/ + /* executed once */ +{ + Boolean Bool_Loc; + + Bool_Loc = Ch_1_Glob == 'A'; + Bool_Glob = Bool_Loc | Bool_Glob; + Ch_2_Glob = 'B'; +} /* Proc_4 */ + + +Proc_5 () /* without parameters */ +/*******/ + /* executed once */ +{ + Ch_1_Glob = 'A'; + Bool_Glob = false; +} /* Proc_5 */ diff --git a/verif/tests/custom/isacov/load_reg_hazard.S b/verif/tests/custom/isacov/load_reg_hazard.S index 15c1c3eece..2b63b4d9b5 100644 --- a/verif/tests/custom/isacov/load_reg_hazard.S +++ b/verif/tests/custom/isacov/load_reg_hazard.S @@ -384,6 +384,27 @@ main: c.lhu a4, 0(a4) c.lhu a5, 0(a5) + li s0, 0x80000000 + li s1, 0x80000000 + li a0, 0x80000000 + li a1, 0x80000000 + li a2, 0x80000000 + li a3, 0x80000000 + li a4, 0x80000000 + li a5, 0x80000000 + + c.lw s0, 0(s0) + c.lw s1, 0(s1) + c.lw a0, 0(a0) + c.lw a1, 0(a1) + c.lw a2, 0(a2) + c.lw a3, 0(a3) + c.lw a4, 0(a4) + c.lw a5, 0(a5) + + li sp, 0x80000000 + c.lwsp sp, 0(sp) + #End of test j test_pass diff --git a/verif/tests/custom/return0/return0.c b/verif/tests/custom/return0/return0.c new file mode 100644 index 0000000000..6b20abee28 --- /dev/null +++ b/verif/tests/custom/return0/return0.c @@ -0,0 +1,22 @@ +/* +** +** Copyright 2020 OpenHW Group +** +** Licensed under the Solderpad Hardware Licence, Version 2.0 (the "License"); +** you may not use this file except in compliance with the License. +** You may obtain a copy of the License at +** +** https://solderpad.org/licenses/ +** +** Unless required by applicable law or agreed to in writing, software +** distributed under the License is distributed on an "AS IS" BASIS, +** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +** See the License for the specific language governing permissions and +** limitations under the License. 
+** +*/ + +int main() { + + return 0; +} diff --git a/verif/tests/testlist_csr_embedded.yaml b/verif/tests/testlist_csr_embedded.yaml index ab15da4ae6..b790060a70 100644 --- a/verif/tests/testlist_csr_embedded.yaml +++ b/verif/tests/testlist_csr_embedded.yaml @@ -28,7 +28,7 @@ common_test_config: &common_test_config path_var: TESTS_PATH - gcc_opts: "-static -mcmodel=medany -fvisibility=hidden -nostdlib -nostartfiles ../tests/custom/common/syscalls.c ../tests/custom/common/crt.S -I../tests/custom/env -I../tests/custom/common -T ../tests/custom/common/test.ld -lgcc" + gcc_opts: "-static -mcmodel=medany -fvisibility=hidden -nostdlib -nostartfiles ../tests/custom/common/syscalls.c ../tests/custom/common/crt.S -I../tests/custom/env -I../tests/custom/common -lgcc" testlist: - test: csr_test diff --git a/verif/tests/testlist_custom.yaml b/verif/tests/testlist_custom.yaml index c326a286a2..7f3c125cbc 100644 --- a/verif/tests/testlist_custom.yaml +++ b/verif/tests/testlist_custom.yaml @@ -33,7 +33,7 @@ common_test_config: &common_test_config path_var: TESTS_PATH - gcc_opts: "-static -mcmodel=medany -fvisibility=hidden -nostdlib -nostartfiles ../tests/custom/common/syscalls.c ../tests/custom/common/crt.S -I../tests/custom/env -I../tests/custom/common -T ../tests/custom/common/test.ld" + gcc_opts: "-static -mcmodel=medany -fvisibility=hidden -nostdlib -nostartfiles ../tests/custom/common/syscalls.c ../tests/custom/common/crt.S -I../tests/custom/env -I../tests/custom/common" testlist: - test: custom_test_template diff --git a/verif/tests/testlist_cvxif.yaml b/verif/tests/testlist_cvxif.yaml index ac6d6ef213..963a89ea44 100644 --- a/verif/tests/testlist_cvxif.yaml +++ b/verif/tests/testlist_cvxif.yaml @@ -28,18 +28,23 @@ common_test_config: &common_test_config path_var: TESTS_PATH - gcc_opts: "-static -mcmodel=medany -fvisibility=hidden -nostdlib -nostartfiles ../tests/custom/common/syscalls.c ../tests/custom/common/crt.S -I../tests/custom/env -I../tests/custom/common -T 
../tests/custom/common/test.ld" + gcc_opts: "-static -mcmodel=medany -fvisibility=hidden -nostdlib -nostartfiles ../tests/custom/common/syscalls.c ../tests/custom/common/crt.S -I../tests/custom/env -I../tests/custom/common" common_test_config_lgcc: &common_test_config_lgcc path_var: TESTS_PATH - gcc_opts: "-static -mcmodel=medany -fvisibility=hidden -nostdlib -nostartfiles ../tests/custom/common/syscalls.c ../tests/custom/common/crt.S -I../tests/custom/env -I../tests/custom/common -T ../tests/custom/common/test.ld -lgcc" + gcc_opts: "-static -mcmodel=medany -fvisibility=hidden -nostdlib -nostartfiles ../tests/custom/common/syscalls.c ../tests/custom/common/crt.S -I../tests/custom/env -I../tests/custom/common -lgcc" testlist: - test: cvxif_add_nop - <<: *common_test_config - iterations: 0 + <<: *common_test_config_lgcc + iterations: 1 asm_tests: /custom/cv_xif/cvxif_add_nop.S + - test: cvxif_full + <<: *common_test_config_lgcc + iterations: 1 + asm_tests: /custom/cv_xif/cvxif_full.S + - test: cvxif_multi <<: *common_test_config iterations: 0 diff --git a/verif/tests/testlist_interrupt.yaml b/verif/tests/testlist_interrupt.yaml index a3cd8ab8b3..00255d8785 100644 --- a/verif/tests/testlist_interrupt.yaml +++ b/verif/tests/testlist_interrupt.yaml @@ -28,5 +28,5 @@ - test: jump_to_zero iterations: 1 path_var: TESTS_PATH - gcc_opts: "-static -mcmodel=medany -fvisibility=hidden -nostdlib -nostartfiles ../tests/custom/common/syscalls.c ../tests/custom/common/crt.S -I../tests/custom/env -I../tests/custom/common -T ../tests/custom/common/test.ld -lgcc" + gcc_opts: "-static -mcmodel=medany -fvisibility=hidden -nostdlib -nostartfiles ../tests/custom/common/syscalls.c ../tests/custom/common/crt.S -I../tests/custom/env -I../tests/custom/common -lgcc" asm_tests: /custom/interrupt/jump_to_zero.S diff --git a/verif/tests/testlist_isacov.yaml b/verif/tests/testlist_isacov.yaml index 2a8b5b87f8..361ae97c7c 100644 --- a/verif/tests/testlist_isacov.yaml +++ 
b/verif/tests/testlist_isacov.yaml @@ -28,7 +28,7 @@ common_test_config: &common_test_config path_var: TESTS_PATH - gcc_opts: "-static -mcmodel=medany -fvisibility=hidden -nostdlib -nostartfiles ../tests/custom/common/syscalls.c ../tests/custom/common/crt.S -I../tests/custom/env -I../tests/custom/common -T ../tests/custom/common/test.ld -lgcc" + gcc_opts: "-static -mcmodel=medany -fvisibility=hidden -nostdlib -nostartfiles ../tests/custom/common/syscalls.c ../tests/custom/common/crt.S -I../tests/custom/env -I../tests/custom/common -lgcc" testlist: - test: branch_test diff --git a/verif/tests/testlist_issues.yaml b/verif/tests/testlist_issues.yaml index 4703854dea..92913456ce 100644 --- a/verif/tests/testlist_issues.yaml +++ b/verif/tests/testlist_issues.yaml @@ -32,7 +32,7 @@ # -------------------------------------------------------------------------------- common_test_config: &common_test_config path_var: TESTS_PATH - gcc_opts: "-static -misa-spec=2.2 -mcmodel=medany -fvisibility=hidden -nostdlib -nostartfiles ../tests/custom/common/syscalls.c ../tests/custom/common/crt.S -I../tests/custom/env -I../tests/custom/common -T ../tests/custom/common/test.ld -lgcc" + gcc_opts: "-static -misa-spec=2.2 -mcmodel=medany -fvisibility=hidden -nostdlib -nostartfiles ../tests/custom/common/syscalls.c ../tests/custom/common/crt.S -I../tests/custom/env -I../tests/custom/common -T ../../config/gen_from_riscv_config/linker/link.ld -lgcc" testlist: - test: compressed-fpreg-commits-rv64 diff --git a/verif/tests/testlist_riscv-compliance-cv32a65x.yaml b/verif/tests/testlist_riscv-compliance-cv32a65x.yaml index 76292f112b..ee4a47217a 100644 --- a/verif/tests/testlist_riscv-compliance-cv32a65x.yaml +++ b/verif/tests/testlist_riscv-compliance-cv32a65x.yaml @@ -354,7 +354,7 @@ testlist: asm_tests: /riscv-compliance/riscv-test-suite/rv32mi/src/scall.S - test: rv32mi-ma_addr - iterations: 1 + iterations: 0 # spike needs to be configured with tval_en=1 <<: *common_test_config asm_tests: 
/riscv-compliance/riscv-test-suite/rv32mi/src/ma_addr.S @@ -409,7 +409,7 @@ testlist: asm_tests: /riscv-compliance/riscv-test-suite/rv32i/src/I-SB-01.S - test: rv32i-I-MISALIGN_LDST-01 - iterations: 1 + iterations: 0 # spike needs to be configured with tval_en=1 <<: *common_test_config asm_tests: /riscv-compliance/riscv-test-suite/rv32i/src/I-MISALIGN_LDST-01.S @@ -468,8 +468,8 @@ testlist: <<: *common_test_config asm_tests: /riscv-compliance/riscv-test-suite/rv32i/src/I-DELAY_SLOTS-01.S - - test: rv32i-I-EBREAK-01 # infinite loop with spike - iterations: 0 + - test: rv32i-I-EBREAK-01 + iterations: 0 # infinite loop with spike <<: *common_test_config asm_tests: /riscv-compliance/riscv-test-suite/rv32i/src/I-EBREAK-01.S @@ -488,8 +488,8 @@ testlist: <<: *common_test_config asm_tests: /riscv-compliance/riscv-test-suite/rv32i/src/I-OR-01.S - - test: rv32i-I-MISALIGN_JMP-01 # infinite loop with spike - iterations: 0 + - test: rv32i-I-MISALIGN_JMP-01 + iterations: 0 # infinite loop with spike <<: *common_test_config asm_tests: /riscv-compliance/riscv-test-suite/rv32i/src/I-MISALIGN_JMP-01.S @@ -624,32 +624,32 @@ testlist: asm_tests: /riscv-compliance/riscv-test-suite/rv32i/src/I-ADD-01.S - test: rv32si-sbreak - iterations: 1 + iterations: 0 # needs supervisor mode <<: *common_test_config asm_tests: /riscv-compliance/riscv-test-suite/rv32si/src/sbreak.S - test: rv32si-scall - iterations: 1 + iterations: 0 # needs supervisor mode <<: *common_test_config asm_tests: /riscv-compliance/riscv-test-suite/rv32si/src/scall.S - test: rv32si-ma_fetch - iterations: 1 + iterations: 0 # needs supervisor mode <<: *common_test_config asm_tests: /riscv-compliance/riscv-test-suite/rv32si/src/ma_fetch.S - test: rv32si-wfi - iterations: 1 + iterations: 0 # needs supervisor mode <<: *common_test_config asm_tests: /riscv-compliance/riscv-test-suite/rv32si/src/wfi.S - test: rv32si-dirty - iterations: 0 # exception on spike + iterations: 0 # exception on spike # needs supervisor mode <<: 
*common_test_config asm_tests: /riscv-compliance/riscv-test-suite/rv32si/src/dirty.S - test: rv32si-csr - iterations: 1 + iterations: 0 # needs supervisor mode <<: *common_test_config asm_tests: /riscv-compliance/riscv-test-suite/rv32si/src/csr.S diff --git a/verif/tests/testlist_riscv-mmu-sv32-arch-test-cv32a60x.yaml b/verif/tests/testlist_riscv-mmu-sv32-arch-test-cv32a6_imac_sv32.yaml similarity index 100% rename from verif/tests/testlist_riscv-mmu-sv32-arch-test-cv32a60x.yaml rename to verif/tests/testlist_riscv-mmu-sv32-arch-test-cv32a6_imac_sv32.yaml diff --git a/verif/tests/testlist_riscv-tests-cv32a65x-p.yaml b/verif/tests/testlist_riscv-tests-cv32a65x-p.yaml index bdafab5f59..15db4a51dc 100644 --- a/verif/tests/testlist_riscv-tests-cv32a65x-p.yaml +++ b/verif/tests/testlist_riscv-tests-cv32a65x-p.yaml @@ -255,7 +255,7 @@ testlist: asm_tests: /riscv-tests/isa/rv32mi/illegal.S - test: rv32mi-p-ma_addr - iterations: 1 + iterations: 0 # spike needs to be configured with tval_en = 1 <<: *common_test_config asm_tests: /riscv-tests/isa/rv32mi/ma_addr.S @@ -280,32 +280,32 @@ testlist: asm_tests: /riscv-tests/isa/rv32mi/shamt.S - test: rv32si-p-csr - iterations: 1 + iterations: 0 # needs supervisor mode <<: *common_test_config asm_tests: /riscv-tests/isa/rv32si/csr.S - test: rv32si-p-ma_fetch - iterations: 1 + iterations: 0 # needs supervisor mode <<: *common_test_config asm_tests: /riscv-tests/isa/rv32si/ma_fetch.S - test: rv32si-p-scall - iterations: 1 + iterations: 0 # needs supervisor mode <<: *common_test_config asm_tests: /riscv-tests/isa/rv32si/scall.S - test: rv32si-p-wfi - iterations: 1 + iterations: 0 # needs supervisor mode <<: *common_test_config asm_tests: /riscv-tests/isa/rv32si/wfi.S - test: rv32si-p-sbreak - iterations: 1 + iterations: 0 # needs supervisor mode <<: *common_test_config asm_tests: /riscv-tests/isa/rv32si/sbreak.S - test: rv32si-p-dirty - iterations: 0 # to be explained + iterations: 0 # needs supervisor mode <<: *common_test_config 
asm_tests: /riscv-tests/isa/rv32si/dirty.S diff --git a/verif/tests/uvmt/base-tests/uvmt_cva6_base_test.sv b/verif/tests/uvmt/base-tests/uvmt_cva6_base_test.sv index 66c4bd8e16..ffcac3a184 100644 --- a/verif/tests/uvmt/base-tests/uvmt_cva6_base_test.sv +++ b/verif/tests/uvmt/base-tests/uvmt_cva6_base_test.sv @@ -70,7 +70,14 @@ class uvmt_cva6_base_test_c extends uvm_test; constraint env_cfg_cons { env_cfg.enabled == 1; env_cfg.is_active == UVM_ACTIVE; - env_cfg.trn_log_enabled == 1; + if (!env_cfg.performance_mode) { + env_cfg.trn_log_enabled == 1; + } else { + env_cfg.trn_log_enabled == 0; + env_cfg.cov_model_enabled == 0; + env_cfg.force_disable_csr_checks == 1; + env_cfg.scoreboard_enabled == 0; + } } constraint axi_agent_cfg_cons { @@ -236,6 +243,12 @@ function void uvmt_cva6_base_test_c::build_phase(uvm_phase phase); pkg_to_cfg (); cfg_hrtbt_monitor(); assign_cfg (); + + if (test_cfg.mem_vp_enabled == 1) begin + set_type_override_by_type(uvml_mem_c#(cva6_config_pkg::CVA6ConfigAxiAddrWidth)::get_type(), + uvml_mem_vp_c#(cva6_config_pkg::CVA6ConfigAxiAddrWidth)::get_type()); + end + create_cntxt (); assign_cntxt (); create_env (); diff --git a/verif/tests/uvmt/base-tests/uvmt_cva6_test_cfg.sv b/verif/tests/uvmt/base-tests/uvmt_cva6_test_cfg.sv index fb719ea16a..c29f177987 100644 --- a/verif/tests/uvmt/base-tests/uvmt_cva6_test_cfg.sv +++ b/verif/tests/uvmt/base-tests/uvmt_cva6_test_cfg.sv @@ -62,6 +62,9 @@ class uvmt_cva6_test_cfg_c extends uvm_object; bit cli_uvm_banner_select_override = 0; string cli_uvm_banner_name_str = ""; + // Virtual Peripherals + bit mem_vp_enabled = 0; + // Run-time control bit run_riscv_gcc_toolchain = 0; bit print_uvm_runflow_banner = 0; @@ -143,6 +146,11 @@ function void uvmt_cva6_test_cfg_c::process_cli_args(); end end + mem_vp_enabled = 0; // default + if ($value$plusargs("mem_vp_enabled=%b", mem_vp_enabled)) begin + `uvm_info("TEST_CFG", $sformatf("process_cli_args() virtual peripherals mem_vp_enabled=0x%0x", mem_vp_enabled), 
UVM_LOW) + end + `uvm_info("TEST_CFG", "process_cli_args() complete", UVM_HIGH) endfunction : process_cli_args diff --git a/verif/tests/uvmt/compliance-tests/uvmt_cva6_firmware_test.sv b/verif/tests/uvmt/compliance-tests/uvmt_cva6_firmware_test.sv index 79c971554b..64b491b3dc 100644 --- a/verif/tests/uvmt/compliance-tests/uvmt_cva6_firmware_test.sv +++ b/verif/tests/uvmt/compliance-tests/uvmt_cva6_firmware_test.sv @@ -168,9 +168,14 @@ task uvmt_cva6_firmware_test_c::run_phase(uvm_phase phase); uvm_config_db#(int)::set(null, "", "test_exit_code", { 0'b0, tb_exit_vif.tb_exit_o[31:1] }); // Let the termination-triggering instruction appear in the log. @(posedge env_cntxt.clknrst_cntxt.vif.clk); + // Let all pending AXI requests settle. + // FIXME TODO: Insert this delay in AXI agent rather than here, + // based on AXI state and latency setting. + `uvm_info("TEST", "Running a 100-cycle delay to settle AXI requests...", UVM_NONE); + repeat (100) @(posedge env_cntxt.clknrst_cntxt.vif.clk); + `uvm_info("TEST", "Running a 100-cycle delay to settle AXI requests... DONE", UVM_NONE); // Allow termination from now on. phase.drop_objection(this); - repeat (100) @(posedge env_cntxt.clknrst_cntxt.vif.clk); endtask : run_phase