From e2d895511dacd262b6d85f5d79aef9d62beb6650 Mon Sep 17 00:00:00 2001 From: Matteo Perotti Date: Sat, 24 Aug 2024 16:16:46 +0200 Subject: [PATCH] [apps] :art: clang-format pass --- apps/dtype-conv3d/kernel/bp-iconv3d.c | 121 ++++++++++---------------- apps/dtype-conv3d/kernel/bp-iconv3d.h | 20 ++--- apps/dtype-conv3d/kernel/dp-fconv3d.c | 16 ++-- apps/dtype-conv3d/kernel/dp-fconv3d.h | 20 ++--- apps/dtype-conv3d/kernel/dp-iconv3d.c | 81 ++++++++--------- apps/dtype-conv3d/kernel/dp-iconv3d.h | 20 ++--- apps/dtype-conv3d/kernel/hp-fconv3d.c | 87 +++++++++--------- apps/dtype-conv3d/kernel/hp-fconv3d.h | 24 ++--- apps/dtype-conv3d/kernel/hp-iconv3d.c | 81 ++++++++--------- apps/dtype-conv3d/kernel/hp-iconv3d.h | 20 ++--- apps/dtype-conv3d/kernel/sp-fconv3d.c | 98 ++++++++++----------- apps/dtype-conv3d/kernel/sp-fconv3d.h | 14 +-- apps/dtype-conv3d/kernel/sp-iconv3d.c | 81 ++++++++--------- apps/dtype-conv3d/kernel/sp-iconv3d.h | 20 ++--- apps/dtype-conv3d/main.c | 4 +- 15 files changed, 318 insertions(+), 389 deletions(-) diff --git a/apps/dtype-conv3d/kernel/bp-iconv3d.c b/apps/dtype-conv3d/kernel/bp-iconv3d.c index 723b6760c..70ca4e5de 100644 --- a/apps/dtype-conv3d/kernel/bp-iconv3d.c +++ b/apps/dtype-conv3d/kernel/bp-iconv3d.c @@ -51,19 +51,19 @@ Change vse64.v and store instructions. Adjust pointer arithmetic: Adjust the stride values for pointer increments (ldo, ldi_pad) - ldo (likely stands for "load output"): This is the stride value used to move to the next row in the output matrix. - ldi_pad (likely stands for "load input padded"): This is the stride value used to move to the next row in the padded input matrix. - Adjust data type in arithmetic instructions: - vfmacc.vf for float, vmacc.vx for int - Same for add and slidedown instruction + ldo (likely stands for "load output"): This is the stride value used to move + to the next row in the output matrix. ldi_pad (likely stands for "load input + padded"): This is the stride value used to move to the next row in the padded + input matrix. Adjust data type in arithmetic instructions: vfmacc.vf for + float, vmacc.vx for int Same for add and slidedown instruction */ #include "bp-iconv3d.h" extern int64_t event_trigger; -int bp_iconv3d_verify(int8_t *matrix, int8_t *golden_matrix, int64_t R, int64_t C, - int64_t threshold) { +int bp_iconv3d_verify(int8_t *matrix, int8_t *golden_matrix, int64_t R, + int64_t C, int64_t threshold) { for (int r = 0; r < R; ++r) for (int c = 0; c < C; ++c) if (!similarity_check(matrix[c + C * r], golden_matrix[c + C * r], @@ -76,7 +76,7 @@ int bp_iconv3d_verify(int8_t *matrix, int8_t *golden_matrix, int64_t R, int64_t } void bp_iconv3d_CHx7x7(int8_t *o, int8_t *i, int8_t *f, int64_t M, int64_t N, - int64_t C, int64_t F) { + int64_t C, int64_t F) { unsigned long int block_size_n; @@ -98,8 +98,8 @@ void bp_iconv3d_CHx7x7(int8_t *o, int8_t *i, int8_t *f, int64_t M, int64_t N, } } -void bp_iconv3d_CHx7x7_warm(int8_t *o, int8_t *i, int8_t *f, int64_t M, int64_t N, - int64_t C, int64_t F) { +void bp_iconv3d_CHx7x7_warm(int8_t *o, int8_t *i, int8_t *f, int64_t M, + int64_t N, int64_t C, int64_t F) { unsigned long int block_size_n; @@ -121,8 +121,8 @@ void bp_iconv3d_CHx7x7_warm(int8_t *o, int8_t *i, int8_t *f, int64_t M, int64_t } } -void bp_iconv3d_CHx7x7_block(int8_t *o, int8_t *i, int8_t *f, int64_t M, int64_t N, - int64_t n_, int64_t C, int64_t F) { +void bp_iconv3d_CHx7x7_block(int8_t *o, int8_t *i, int8_t *f, int64_t M, + int64_t N, int64_t n_, int64_t C, int64_t F) { // Helper variables int64_t ldo = N; @@ -175,15 +175,9 @@ void bp_iconv3d_CHx7x7_block(int8_t *o, int8_t *i, int8_t *f, int64_t M, int64_t i_slide_ptr_3 = i__ + n_ + 3 * (N + F - 1); // Load four input rows belonging to channel ch - asm volatile("vle8.v v0, (%0); add %0, %0, %1" - : "+&r"(i__) - : "r"(ldi_pad)); - asm volatile("vle8.v v4, (%0); add %0, %0, %1" - : "+&r"(i__) - : "r"(ldi_pad)); - asm volatile("vle8.v v8, (%0); add %0, %0, %1" - : "+&r"(i__) - : "r"(ldi_pad)); + asm volatile("vle8.v v0, (%0); add %0, %0, %1" : "+&r"(i__) : "r"(ldi_pad)); + asm volatile("vle8.v v4, (%0); add %0, %0, %1" : "+&r"(i__) : "r"(ldi_pad)); + asm volatile("vle8.v v8, (%0); add %0, %0, %1" : "+&r"(i__) : "r"(ldi_pad)); asm volatile("vle8.v v12, (%0); add %0, %0, %1" : "+&r"(i__) : "r"(ldi_pad)); @@ -275,12 +269,8 @@ void bp_iconv3d_CHx7x7_block(int8_t *o, int8_t *i, int8_t *f, int64_t M, int64_t i_slide_ptr_1 = i__ + n_ + 1 * (N + F - 1); i_slide_ptr_2 = i__ + n_ + 2 * (N + F - 1); - asm volatile("vle8.v v2, (%0); add %0, %0, %1" - : "+&r"(i__) - : "r"(ldi_pad)); - asm volatile("vle8.v v6, (%0); add %0, %0, %1" - : "+&r"(i__) - : "r"(ldi_pad)); + asm volatile("vle8.v v2, (%0); add %0, %0, %1" : "+&r"(i__) : "r"(ldi_pad)); + asm volatile("vle8.v v6, (%0); add %0, %0, %1" : "+&r"(i__) : "r"(ldi_pad)); asm volatile("vle8.v v10, (%0); add %0, %0, %1" : "+&r"(i__) : "r"(ldi_pad)); @@ -710,15 +700,9 @@ void bp_iconv3d_CHx7x7_block(int8_t *o, int8_t *i, int8_t *f, int64_t M, int64_t i_slide_ptr_3 = i__ + n_ + 3 * (N + F - 1); // Load other three input rows (one was already loaded) - asm volatile("vle8.v v0, (%0); add %0, %0, %1" - : "+&r"(i__) - : "r"(ldi_pad)); - asm volatile("vle8.v v4, (%0); add %0, %0, %1" - : "+&r"(i__) - : "r"(ldi_pad)); - asm volatile("vle8.v v8, (%0); add %0, %0, %1" - : "+&r"(i__) - : "r"(ldi_pad)); + asm volatile("vle8.v v0, (%0); add %0, %0, %1" : "+&r"(i__) : "r"(ldi_pad)); + asm volatile("vle8.v v4, (%0); add %0, %0, %1" : "+&r"(i__) : "r"(ldi_pad)); + asm volatile("vle8.v v8, (%0); add %0, %0, %1" : "+&r"(i__) : "r"(ldi_pad)); asm volatile("vle8.v v12, (%0); add %0, %0, %1" : "+&r"(i__) : "r"(ldi_pad)); @@ -864,12 +848,8 @@ void bp_iconv3d_CHx7x7_block(int8_t *o, int8_t *i, int8_t *f, int64_t M, int64_t i_slide_ptr_1 = i__ + n_ + 1 * (N + F - 1); i_slide_ptr_2 = i__ + n_ + 2 * (N + F - 1); - asm volatile("vle8.v v2, (%0); add %0, %0, %1" - : "+&r"(i__) - : "r"(ldi_pad)); - asm volatile("vle8.v v6, (%0); add %0, %0, %1" - : "+&r"(i__) - : "r"(ldi_pad)); + asm volatile("vle8.v v2, (%0); add %0, %0, %1" : "+&r"(i__) : "r"(ldi_pad)); + asm volatile("vle8.v v6, (%0); add %0, %0, %1" : "+&r"(i__) : "r"(ldi_pad)); asm volatile("vle8.v v10, (%0); add %0, %0, %1" : "+&r"(i__) : "r"(ldi_pad)); @@ -928,9 +908,8 @@ void bp_iconv3d_CHx7x7_block(int8_t *o, int8_t *i, int8_t *f, int64_t M, int64_t asm volatile("vse8.v v28, (%0); add %0, %0, %1" : "+&r"(o) : "r"(ldo)); } - void bp_iconv3d_warm(int8_t *o, int8_t *i, int8_t *f, int64_t M, int64_t N, - int64_t n_, int64_t C, int64_t F) { + int64_t n_, int64_t C, int64_t F) { // Helper variables int64_t ldo = N; @@ -1027,7 +1006,6 @@ void bp_iconv3d_warm(int8_t *o, int8_t *i, int8_t *f, int64_t M, int64_t N, i_slide_ptr_1 = i__ + n_ + 1 * (N + F - 1); i_slide_ptr_2 = i__ + n_ + 2 * (N + F - 1); - // Main kernel, unrolled by 2 for (int k = 0; k < F / 2; ++k) { // Two base indexes because of the unrolling @@ -1063,7 +1041,6 @@ void bp_iconv3d_warm(int8_t *o, int8_t *i, int8_t *f, int64_t M, int64_t N, asm volatile("vslide1down.vx v6, v4, %0" ::"r"(*i_slide_ptr_1++)); asm volatile("vslide1down.vx v10, v8, %0" ::"r"(*i_slide_ptr_2++)); - } // The very last iterations require mixing the instructions with the store @@ -1080,7 +1057,6 @@ void bp_iconv3d_warm(int8_t *o, int8_t *i, int8_t *f, int64_t M, int64_t N, // Reuse preloaded coefficients // Buffer the next coefficients for faster use - // Bump the input ptr i_ += 3 * (N + F - 1); @@ -1145,43 +1121,41 @@ void bp_iconv3d_warm(int8_t *o, int8_t *i, int8_t *f, int64_t M, int64_t N, if (ch != C - 1) { int64_t base_idx_0 = (ch + 1) * fch_len; - } } - } + } - // Bump the input ptr - i_ += N + F - 1; + // Bump the input ptr + i_ += N + F - 1; #ifdef VCD_DUMP - // Stop dumping VCD - event_trigger = -1; + // Stop dumping VCD + event_trigger = -1; #endif - ////////////// - // UNROLL 1 // - ////////////// + ////////////// + // UNROLL 1 // + ////////////// - // Loop on the channels - for (int ch = 0; ch < C; ++ch) { + // Loop on the channels + for (int ch = 0; ch < C; ++ch) { - // Point to the first element of the channel ch - i__ = i_ + ch * ich_len; + // Point to the first element of the channel ch + i__ = i_ + ch * ich_len; - // Start calculating the next pointers to the elements to be slided in - i_slide_ptr_1 = i__ + n_; + // Start calculating the next pointers to the elements to be slided in + i_slide_ptr_1 = i__ + n_; - for (int k = 0; k < F / 2; ++k) { - // Two base indexes because of the unrolling - // Point to the first element of the current column (k) of the current - // channel (ch) of the filter (f) - int64_t base_idx_0 = (2 * k + 2) + (ch * fch_len); - // Point to the first element of the current column (k+1) of the current - // channel (ch) of the filter (f) - int64_t base_idx_1 = (2 * k + 1) + (ch * fch_len); + for (int k = 0; k < F / 2; ++k) { + // Two base indexes because of the unrolling + // Point to the first element of the current column (k) of the current + // channel (ch) of the filter (f) + int64_t base_idx_0 = (2 * k + 2) + (ch * fch_len); + // Point to the first element of the current column (k+1) of the current + // channel (ch) of the filter (f) + int64_t base_idx_1 = (2 * k + 1) + (ch * fch_len); } - // Bump the input ptr i_ += N + F - 1; } @@ -1217,7 +1191,7 @@ void bp_iconv3d_warm(int8_t *o, int8_t *i, int8_t *f, int64_t M, int64_t N, if ((k | ch) == 0) asm volatile("vmul.vx v28, v0, %0" ::"r"(f[0 + base_idx_0])); else - asm volatile("vslide1down.vx v6, v4, %0" ::"r"(*i_slide_ptr_1++)); + asm volatile("vslide1down.vx v6, v4, %0" ::"r"(*i_slide_ptr_1++)); asm volatile("vslide1down.vx v10, v8, %0" ::"r"(*i_slide_ptr_2++)); asm volatile("vslide1down.vx v14, v12, %0" ::"r"(*i_slide_ptr_3++)); @@ -1226,10 +1200,8 @@ void bp_iconv3d_warm(int8_t *o, int8_t *i, int8_t *f, int64_t M, int64_t N, asm volatile("vslide1down.vx v8, v10, %0" ::"r"(*i_slide_ptr_2++)); asm volatile("vslide1down.vx v12, v14, %0" ::"r"(*i_slide_ptr_3++)); } - } - // Bump the input ptr i_ += 4 * (N + F - 1); @@ -1272,7 +1244,6 @@ void bp_iconv3d_warm(int8_t *o, int8_t *i, int8_t *f, int64_t M, int64_t N, } } - /* //////////////////// // MAIN ALGOMITHM // diff --git a/apps/dtype-conv3d/kernel/bp-iconv3d.h b/apps/dtype-conv3d/kernel/bp-iconv3d.h index cc9134303..f7ca39df6 100644 --- a/apps/dtype-conv3d/kernel/bp-iconv3d.h +++ b/apps/dtype-conv3d/kernel/bp-iconv3d.h @@ -19,10 +19,10 @@ #ifndef ICONV3D_H #define ICONV3D_H +#include "printf.h" +#include "util.h" #include #include -#include "util.h" -#include "printf.h" // Threshold for FP numbers comparison during the final check #define THRESHOLD 0 @@ -34,19 +34,19 @@ #define DATA_WIDTH "int8" void bp_iconv3d_CHx7x7(int8_t *o, int8_t *i, int8_t *f, int64_t M, int64_t N, - int64_t C, int64_t F); + int64_t C, int64_t F); -void bp_iconv3d_CHx7x7_block(int8_t *o, int8_t *i, int8_t *f, int64_t M, int64_t N, - int64_t n_, int64_t C, int64_t F); +void bp_iconv3d_CHx7x7_block(int8_t *o, int8_t *i, int8_t *f, int64_t M, + int64_t N, int64_t n_, int64_t C, int64_t F); -void bp_iconv3d_CHx7x7_warm(int8_t *o, int8_t *i, int8_t *f, int64_t M, int64_t N, - int64_t C, int64_t F); +void bp_iconv3d_CHx7x7_warm(int8_t *o, int8_t *i, int8_t *f, int64_t M, + int64_t N, int64_t C, int64_t F); void bp_iconv3d_warm(int8_t *o, int8_t *i, int8_t *f, int64_t M, int64_t N, - int64_t n_, int64_t C, int64_t F); + int64_t n_, int64_t C, int64_t F); // Verify the matrices -int bp_iconv3d_verify(int8_t *matrix, int8_t *golden_matrix, int64_t R, int64_t C, - int64_t threshold); +int bp_iconv3d_verify(int8_t *matrix, int8_t *golden_matrix, int64_t R, + int64_t C, int64_t threshold); #endif diff --git a/apps/dtype-conv3d/kernel/dp-fconv3d.c b/apps/dtype-conv3d/kernel/dp-fconv3d.c index c487ea0f4..6c88644f8 100644 --- a/apps/dtype-conv3d/kernel/dp-fconv3d.c +++ b/apps/dtype-conv3d/kernel/dp-fconv3d.c @@ -53,8 +53,8 @@ extern int64_t event_trigger; // Verify the matrices -int dp_fconv3d_verify(double *matrix, double *golden_matrix, int64_t R, int64_t C, - double threshold) { +int dp_fconv3d_verify(double *matrix, double *golden_matrix, int64_t R, + int64_t C, double threshold) { for (int r = 0; r < R; ++r) for (int c = 0; c < C; ++c) if (!similarity_check(matrix[c + C * r], golden_matrix[c + C * r], @@ -67,7 +67,7 @@ int dp_fconv3d_verify(double *matrix, double *golden_matrix, int64_t R, int64_t } void dp_fconv3d_CHx7x7(double *o, double *i, double *f, int64_t M, int64_t N, - int64_t C, int64_t F) { + int64_t C, int64_t F) { unsigned long int block_size_n; @@ -89,8 +89,8 @@ void dp_fconv3d_CHx7x7(double *o, double *i, double *f, int64_t M, int64_t N, } } -void dp_fconv3d_CHx7x7_warm(double *o, double *i, double *f, int64_t M, int64_t N, - int64_t C, int64_t F) { +void dp_fconv3d_CHx7x7_warm(double *o, double *i, double *f, int64_t M, + int64_t N, int64_t C, int64_t F) { unsigned long int block_size_n; @@ -112,8 +112,8 @@ void dp_fconv3d_CHx7x7_warm(double *o, double *i, double *f, int64_t M, int64_t } } -void dp_fconv3d_CHx7x7_block(double *o, double *i, double *f, int64_t M, int64_t N, - int64_t n_, int64_t C, int64_t F) { +void dp_fconv3d_CHx7x7_block(double *o, double *i, double *f, int64_t M, + int64_t N, int64_t n_, int64_t C, int64_t F) { // Helper variables int64_t ldo = N << 3; @@ -921,7 +921,7 @@ void dp_fconv3d_CHx7x7_block(double *o, double *i, double *f, int64_t M, int64_t } void dp_fconv3d_warm(double *o, double *i, double *f, int64_t M, int64_t N, - int64_t n_, int64_t C, int64_t F) { + int64_t n_, int64_t C, int64_t F) { // Helper variables int64_t ldo = N << 3; diff --git a/apps/dtype-conv3d/kernel/dp-fconv3d.h b/apps/dtype-conv3d/kernel/dp-fconv3d.h index 30f0955f8..20fe286d1 100644 --- a/apps/dtype-conv3d/kernel/dp-fconv3d.h +++ b/apps/dtype-conv3d/kernel/dp-fconv3d.h @@ -19,10 +19,10 @@ #ifndef FCONV3D_H #define FCONV3D_H +#include "printf.h" +#include "util.h" #include #include -#include "util.h" -#include "printf.h" // Threshold for FP numbers comparison during the final check #define THRESHOLD 0.000000000001 @@ -34,19 +34,19 @@ #define DATA_WIDTH "float64" void dp_fconv3d_CHx7x7(double *o, double *i, double *f, int64_t M, int64_t N, - int64_t C, int64_t F); + int64_t C, int64_t F); -void dp_fconv3d_CHx7x7_block(double *o, double *i, double *f, int64_t M, int64_t N, - int64_t n_, int64_t C, int64_t F); +void dp_fconv3d_CHx7x7_block(double *o, double *i, double *f, int64_t M, + int64_t N, int64_t n_, int64_t C, int64_t F); -void dp_fconv3d_CHx7x7_warm(double *o, double *i, double *f, int64_t M, int64_t N, - int64_t C, int64_t F); +void dp_fconv3d_CHx7x7_warm(double *o, double *i, double *f, int64_t M, + int64_t N, int64_t C, int64_t F); void dp_fconv3d_warm(double *o, double *i, double *f, int64_t M, int64_t N, - int64_t n_, int64_t C, int64_t F); + int64_t n_, int64_t C, int64_t F); // Verify the matrices -int dp_fconv3d_verify(double *matrix, double *golden_matrix, int64_t R, int64_t C, - double threshold); +int dp_fconv3d_verify(double *matrix, double *golden_matrix, int64_t R, + int64_t C, double threshold); #endif diff --git a/apps/dtype-conv3d/kernel/dp-iconv3d.c b/apps/dtype-conv3d/kernel/dp-iconv3d.c index 3fe5a5595..2a0441786 100644 --- a/apps/dtype-conv3d/kernel/dp-iconv3d.c +++ b/apps/dtype-conv3d/kernel/dp-iconv3d.c @@ -53,11 +53,11 @@ Change vse64.v and store instructions. Adjust pointer arithmetic: Adjust the stride values for pointer increments (ldo, ldi_pad) - ldo (likely stands for "load output"): This is the stride value used to move to the next row in the output matrix. - ldi_pad (likely stands for "load input padded"): This is the stride value used to move to the next row in the padded input matrix. - Adjust data type in arithmetic instructions: - vfmacc.vf for float, vmacc.vx for int - Same for add and slidedown instruction + ldo (likely stands for "load output"): This is the stride value used to move + to the next row in the output matrix. ldi_pad (likely stands for "load input + padded"): This is the stride value used to move to the next row in the padded + input matrix. Adjust data type in arithmetic instructions: vfmacc.vf for + float, vmacc.vx for int Same for add and slidedown instruction */ #include "dp-iconv3d.h" @@ -65,8 +65,8 @@ extern int64_t event_trigger; // Verify the matrices -int dp_iconv3d_verify(int64_t *matrix, int64_t *golden_matrix, int64_t R, int64_t C, - int64_t threshold) { +int dp_iconv3d_verify(int64_t *matrix, int64_t *golden_matrix, int64_t R, + int64_t C, int64_t threshold) { for (int r = 0; r < R; ++r) for (int c = 0; c < C; ++c) if (!similarity_check(matrix[c + C * r], golden_matrix[c + C * r], @@ -79,7 +79,7 @@ int dp_iconv3d_verify(int64_t *matrix, int64_t *golden_matrix, int64_t R, int64_ } void dp_iconv3d_CHx7x7(int64_t *o, int64_t *i, int64_t *f, int64_t M, int64_t N, - int64_t C, int64_t F) { + int64_t C, int64_t F) { unsigned long int block_size_n; @@ -101,8 +101,8 @@ void dp_iconv3d_CHx7x7(int64_t *o, int64_t *i, int64_t *f, int64_t M, int64_t N, } } -void dp_iconv3d_CHx7x7_warm(int64_t *o, int64_t *i, int64_t *f, int64_t M, int64_t N, - int64_t C, int64_t F) { +void dp_iconv3d_CHx7x7_warm(int64_t *o, int64_t *i, int64_t *f, int64_t M, + int64_t N, int64_t C, int64_t F) { unsigned long int block_size_n; @@ -124,8 +124,8 @@ void dp_iconv3d_CHx7x7_warm(int64_t *o, int64_t *i, int64_t *f, int64_t M, int64 } } -void dp_iconv3d_CHx7x7_block(int64_t *o, int64_t *i, int64_t *f, int64_t M, int64_t N, - int64_t n_, int64_t C, int64_t F) { +void dp_iconv3d_CHx7x7_block(int64_t *o, int64_t *i, int64_t *f, int64_t M, + int64_t N, int64_t n_, int64_t C, int64_t F) { // Helper variables int64_t ldo = N << 3; @@ -930,9 +930,8 @@ void dp_iconv3d_CHx7x7_block(int64_t *o, int64_t *i, int64_t *f, int64_t M, int6 asm volatile("vse64.v v28, (%0); add %0, %0, %1" : "+&r"(o) : "r"(ldo)); } - void dp_iconv3d_warm(int64_t *o, int64_t *i, int64_t *f, int64_t M, int64_t N, - int64_t n_, int64_t C, int64_t F) { + int64_t n_, int64_t C, int64_t F) { // Helper variables int64_t ldo = N << 2; @@ -1029,7 +1028,6 @@ void dp_iconv3d_warm(int64_t *o, int64_t *i, int64_t *f, int64_t M, int64_t N, i_slide_ptr_1 = i__ + n_ + 1 * (N + F - 1); i_slide_ptr_2 = i__ + n_ + 2 * (N + F - 1); - // Main kernel, unrolled by 2 for (int k = 0; k < F / 2; ++k) { // Two base indexes because of the unrolling @@ -1065,7 +1063,6 @@ void dp_iconv3d_warm(int64_t *o, int64_t *i, int64_t *f, int64_t M, int64_t N, asm volatile("vslide1down.vx v6, v4, %0" ::"r"(*i_slide_ptr_1++)); asm volatile("vslide1down.vx v10, v8, %0" ::"r"(*i_slide_ptr_2++)); - } // The very last iterations require mixing the instructions with the store @@ -1082,7 +1079,6 @@ void dp_iconv3d_warm(int64_t *o, int64_t *i, int64_t *f, int64_t M, int64_t N, // Reuse preloaded coefficients // Buffer the next coefficients for faster use - // Bump the input ptr i_ += 3 * (N + F - 1); @@ -1147,43 +1143,41 @@ void dp_iconv3d_warm(int64_t *o, int64_t *i, int64_t *f, int64_t M, int64_t N, if (ch != C - 1) { int64_t base_idx_0 = (ch + 1) * fch_len; - } } - } + } - // Bump the input ptr - i_ += N + F - 1; + // Bump the input ptr + i_ += N + F - 1; #ifdef VCD_DUMP - // Stop dumping VCD - event_trigger = -1; + // Stop dumping VCD + event_trigger = -1; #endif - ////////////// - // UNROLL 1 // - ////////////// + ////////////// + // UNROLL 1 // + ////////////// - // Loop on the channels - for (int ch = 0; ch < C; ++ch) { + // Loop on the channels + for (int ch = 0; ch < C; ++ch) { - // Point to the first element of the channel ch - i__ = i_ + ch * ich_len; + // Point to the first element of the channel ch + i__ = i_ + ch * ich_len; - // Start calculating the next pointers to the elements to be slided in - i_slide_ptr_1 = i__ + n_; + // Start calculating the next pointers to the elements to be slided in + i_slide_ptr_1 = i__ + n_; - for (int k = 0; k < F / 2; ++k) { - // Two base indexes because of the unrolling - // Point to the first element of the current column (k) of the current - // channel (ch) of the filter (f) - int64_t base_idx_0 = (2 * k + 2) + (ch * fch_len); - // Point to the first element of the current column (k+1) of the current - // channel (ch) of the filter (f) - int64_t base_idx_1 = (2 * k + 1) + (ch * fch_len); + for (int k = 0; k < F / 2; ++k) { + // Two base indexes because of the unrolling + // Point to the first element of the current column (k) of the current + // channel (ch) of the filter (f) + int64_t base_idx_0 = (2 * k + 2) + (ch * fch_len); + // Point to the first element of the current column (k+1) of the current + // channel (ch) of the filter (f) + int64_t base_idx_1 = (2 * k + 1) + (ch * fch_len); } - // Bump the input ptr i_ += N + F - 1; } @@ -1219,7 +1213,7 @@ void dp_iconv3d_warm(int64_t *o, int64_t *i, int64_t *f, int64_t M, int64_t N, if ((k | ch) == 0) asm volatile("vmul.vx v28, v0, %0" ::"r"(f[0 + base_idx_0])); else - asm volatile("vslide1down.vx v6, v4, %0" ::"r"(*i_slide_ptr_1++)); + asm volatile("vslide1down.vx v6, v4, %0" ::"r"(*i_slide_ptr_1++)); asm volatile("vslide1down.vx v10, v8, %0" ::"r"(*i_slide_ptr_2++)); asm volatile("vslide1down.vx v14, v12, %0" ::"r"(*i_slide_ptr_3++)); @@ -1228,10 +1222,8 @@ void dp_iconv3d_warm(int64_t *o, int64_t *i, int64_t *f, int64_t M, int64_t N, asm volatile("vslide1down.vx v8, v10, %0" ::"r"(*i_slide_ptr_2++)); asm volatile("vslide1down.vx v12, v14, %0" ::"r"(*i_slide_ptr_3++)); } - } - // Bump the input ptr i_ += 4 * (N + F - 1); @@ -1274,7 +1266,6 @@ void dp_iconv3d_warm(int64_t *o, int64_t *i, int64_t *f, int64_t M, int64_t N, } } - /* //////////////////// // MAIN ALGOMITHM // diff --git a/apps/dtype-conv3d/kernel/dp-iconv3d.h b/apps/dtype-conv3d/kernel/dp-iconv3d.h index bda3ad761..672b522fa 100644 --- a/apps/dtype-conv3d/kernel/dp-iconv3d.h +++ b/apps/dtype-conv3d/kernel/dp-iconv3d.h @@ -19,10 +19,10 @@ #ifndef ICONV3D_H #define ICONV3D_H +#include "printf.h" +#include "util.h" #include #include -#include "util.h" -#include "printf.h" // Threshold for FP numbers comparison during the final check #define THRESHOLD 0 @@ -34,19 +34,19 @@ #define DATA_WIDTH "int64" void dp_iconv3d_CHx7x7(int64_t *o, int64_t *i, int64_t *f, int64_t M, int64_t N, - int64_t C, int64_t F); + int64_t C, int64_t F); -void dp_iconv3d_CHx7x7_block(int64_t *o, int64_t *i, int64_t *f, int64_t M, int64_t N, - int64_t n_, int64_t C, int64_t F); +void dp_iconv3d_CHx7x7_block(int64_t *o, int64_t *i, int64_t *f, int64_t M, + int64_t N, int64_t n_, int64_t C, int64_t F); -void dp_iconv3d_CHx7x7_warm(int64_t *o, int64_t *i, int64_t *f, int64_t M, int64_t N, - int64_t C, int64_t F); +void dp_iconv3d_CHx7x7_warm(int64_t *o, int64_t *i, int64_t *f, int64_t M, + int64_t N, int64_t C, int64_t F); void dp_iconv3d_warm(int64_t *o, int64_t *i, int64_t *f, int64_t M, int64_t N, - int64_t n_, int64_t C, int64_t F); + int64_t n_, int64_t C, int64_t F); // Verify the matrices -int dp_iconv3d_verify(int64_t *matrix, int64_t *golden_matrix, int64_t R, int64_t C, - int64_t threshold); +int dp_iconv3d_verify(int64_t *matrix, int64_t *golden_matrix, int64_t R, + int64_t C, int64_t threshold); #endif diff --git a/apps/dtype-conv3d/kernel/hp-fconv3d.c b/apps/dtype-conv3d/kernel/hp-fconv3d.c index 0219a2f49..c734f7498 100644 --- a/apps/dtype-conv3d/kernel/hp-fconv3d.c +++ b/apps/dtype-conv3d/kernel/hp-fconv3d.c @@ -51,11 +51,11 @@ Change vse64.v and store instructions. Adjust pointer arithmetic: Adjust the stride values for pointer increments (ldo, ldi_pad) - ldo (likely stands for "load output"): This is the stride value used to move to the next row in the output matrix. - ldi_pad (likely stands for "load input padded"): This is the stride value used to move to the next row in the padded input matrix. - Adjust data type in arithmetic instructions: - vfmacc.vf for float, vmacc.vx for int - Same for add and slidedown instruction + ldo (likely stands for "load output"): This is the stride value used to move + to the next row in the output matrix. ldi_pad (likely stands for "load input + padded"): This is the stride value used to move to the next row in the padded + input matrix. Adjust data type in arithmetic instructions: vfmacc.vf for + float, vmacc.vx for int Same for add and slidedown instruction */ #include "hp-fconv3d.h" @@ -63,21 +63,21 @@ extern int64_t event_trigger; // Verify the matrices -int hp_fconv3d_verify(_Float16 *matrix, _Float16 *golden_matrix, int64_t R, int64_t C, - _Float16 threshold) { +int hp_fconv3d_verify(_Float16 *matrix, _Float16 *golden_matrix, int64_t R, + int64_t C, _Float16 threshold) { for (int r = 0; r < R; ++r) for (int c = 0; c < C; ++c) if (!similarity_check(matrix[c + C * r], golden_matrix[c + C * r], threshold)) { printf("Error: o[%d][%d] = %f, instead of %f\n", r, c, - (float) matrix[c + C * r], (float) golden_matrix[c + C * r]); + (float)matrix[c + C * r], (float)golden_matrix[c + C * r]); return 1; } return 0; } -void hp_fconv3d_CHx7x7(_Float16 *o, _Float16 *i, _Float16 *f, int64_t M, int64_t N, - int64_t C, int64_t F) { +void hp_fconv3d_CHx7x7(_Float16 *o, _Float16 *i, _Float16 *f, int64_t M, + int64_t N, int64_t C, int64_t F) { unsigned long int block_size_n; @@ -99,8 +99,8 @@ void hp_fconv3d_CHx7x7(_Float16 *o, _Float16 *i, _Float16 *f, int64_t M, int64_t } } -void hp_fconv3d_CHx7x7_warm(_Float16 *o, _Float16 *i, _Float16 *f, int64_t M, int64_t N, - int64_t C, int64_t F) { +void hp_fconv3d_CHx7x7_warm(_Float16 *o, _Float16 *i, _Float16 *f, int64_t M, + int64_t N, int64_t C, int64_t F) { unsigned long int block_size_n; @@ -122,8 +122,8 @@ void hp_fconv3d_CHx7x7_warm(_Float16 *o, _Float16 *i, _Float16 *f, int64_t M, in } } -void hp_fconv3d_CHx7x7_block(_Float16 *o, _Float16 *i, _Float16 *f, int64_t M, int64_t N, - int64_t n_, int64_t C, int64_t F) { +void hp_fconv3d_CHx7x7_block(_Float16 *o, _Float16 *i, _Float16 *f, int64_t M, + int64_t N, int64_t n_, int64_t C, int64_t F) { // Helper variables int64_t ldo = N << 1; @@ -929,9 +929,8 @@ void hp_fconv3d_CHx7x7_block(_Float16 *o, _Float16 *i, _Float16 *f, int64_t M, i asm volatile("vse16.v v28, (%0); add %0, %0, %1" : "+&r"(o) : "r"(ldo)); } - -void hp_fconv3d_warm(_Float16 *o, _Float16 *i, _Float16 *f, int64_t M, int64_t N, - int64_t n_, int64_t C, int64_t F) { +void hp_fconv3d_warm(_Float16 *o, _Float16 *i, _Float16 *f, int64_t M, + int64_t N, int64_t n_, int64_t C, int64_t F) { // Helper variables int64_t ldo = N << 1; @@ -1028,7 +1027,6 @@ void hp_fconv3d_warm(_Float16 *o, _Float16 *i, _Float16 *f, int64_t M, int64_t N i_slide_ptr_1 = i__ + n_ + 1 * (N + F - 1); i_slide_ptr_2 = i__ + n_ + 2 * (N + F - 1); - // Main kernel, unrolled by 2 for (int k = 0; k < F / 2; ++k) { // Two base indexes because of the unrolling @@ -1064,7 +1062,6 @@ void hp_fconv3d_warm(_Float16 *o, _Float16 *i, _Float16 *f, int64_t M, int64_t N asm volatile("vfslide1down.vf v6, v4, %0" ::"f"(*i_slide_ptr_1++)); asm volatile("vfslide1down.vf v10, v8, %0" ::"f"(*i_slide_ptr_2++)); - } // The very last iterations require mixing the instructions with the store @@ -1081,7 +1078,6 @@ void hp_fconv3d_warm(_Float16 *o, _Float16 *i, _Float16 *f, int64_t M, int64_t N // Reuse preloaded coefficients // Buffer the next coefficients for faster use - // Bump the input ptr i_ += 3 * (N + F - 1); @@ -1146,43 +1142,41 @@ void hp_fconv3d_warm(_Float16 *o, _Float16 *i, _Float16 *f, int64_t M, int64_t N if (ch != C - 1) { int64_t base_idx_0 = (ch + 1) * fch_len; - } } - } + } - // Bump the input ptr - i_ += N + F - 1; + // Bump the input ptr + i_ += N + F - 1; #ifdef VCD_DUMP - // Stop dumping VCD - event_trigger = -1; + // Stop dumping VCD + event_trigger = -1; #endif - ////////////// - // UNROLL 1 // - ////////////// + ////////////// + // UNROLL 1 // + ////////////// - // Loop on the channels - for (int ch = 0; ch < C; ++ch) { + // Loop on the channels + for (int ch = 0; ch < C; ++ch) { - // Point to the first element of the channel ch - i__ = i_ + ch * ich_len; + // Point to the first element of the channel ch + i__ = i_ + ch * ich_len; - // Start calculating the next pointers to the elements to be slided in - i_slide_ptr_1 = i__ + n_; + // Start calculating the next pointers to the elements to be slided in + i_slide_ptr_1 = i__ + n_; - for (int k = 0; k < F / 2; ++k) { - // Two base indexes because of the unrolling - // Point to the first element of the current column (k) of the current - // channel (ch) of the filter (f) - int64_t base_idx_0 = (2 * k + 2) + (ch * fch_len); - // Point to the first element of the current column (k+1) of the current - // channel (ch) of the filter (f) - int64_t base_idx_1 = (2 * k + 1) + (ch * fch_len); + for (int k = 0; k < F / 2; ++k) { + // Two base indexes because of the unrolling + // Point to the first element of the current column (k) of the current + // channel (ch) of the filter (f) + int64_t base_idx_0 = (2 * k + 2) + (ch * fch_len); + // Point to the first element of the current column (k+1) of the current + // channel (ch) of the filter (f) + int64_t base_idx_1 = (2 * k + 1) + (ch * fch_len); } - // Bump the input ptr i_ += N + F - 1; } @@ -1218,7 +1212,7 @@ void hp_fconv3d_warm(_Float16 *o, _Float16 *i, _Float16 *f, int64_t M, int64_t N if ((k | ch) == 0) asm volatile("vfmul.vf v28, v0, %0" ::"f"(f[0 + base_idx_0])); else - asm volatile("vfslide1down.vf v6, v4, %0" ::"f"(*i_slide_ptr_1++)); + asm volatile("vfslide1down.vf v6, v4, %0" ::"f"(*i_slide_ptr_1++)); asm volatile("vfslide1down.vf v10, v8, %0" ::"f"(*i_slide_ptr_2++)); asm volatile("vfslide1down.vf v14, v12, %0" ::"f"(*i_slide_ptr_3++)); @@ -1227,10 +1221,8 @@ void hp_fconv3d_warm(_Float16 *o, _Float16 *i, _Float16 *f, int64_t M, int64_t N asm volatile("vfslide1down.vf v8, v10, %0" ::"f"(*i_slide_ptr_2++)); asm volatile("vfslide1down.vf v12, v14, %0" ::"f"(*i_slide_ptr_3++)); } - } - // Bump the input ptr i_ += 4 * (N + F - 1); @@ -1273,7 +1265,6 @@ void hp_fconv3d_warm(_Float16 *o, _Float16 *i, _Float16 *f, int64_t M, int64_t N } } - /* //////////////////// // MAIN ALGOMITHM // diff --git a/apps/dtype-conv3d/kernel/hp-fconv3d.h b/apps/dtype-conv3d/kernel/hp-fconv3d.h index 8c3aefb1e..76ed914b0 100644 --- a/apps/dtype-conv3d/kernel/hp-fconv3d.h +++ b/apps/dtype-conv3d/kernel/hp-fconv3d.h @@ -19,10 +19,10 @@ #ifndef FCONV3D_H #define FCONV3D_H -#include -#include #include "printf.h" #include "util.h" +#include +#include // Threshold for FP numbers comparison during the final check #define THRESHOLD 1 @@ -33,20 +33,20 @@ #define DTYPE_PREFIX "HP" #define DATA_WIDTH "float16" -void hp_fconv3d_CHx7x7(_Float16 *o, _Float16 *i, _Float16 *f, int64_t M, int64_t N, - int64_t C, int64_t F); +void hp_fconv3d_CHx7x7(_Float16 *o, _Float16 *i, _Float16 *f, int64_t M, + int64_t N, int64_t C, int64_t F); -void hp_fconv3d_CHx7x7_block(_Float16 *o, _Float16 *i, _Float16 *f, int64_t M, int64_t N, - int64_t n_, int64_t C, int64_t F); +void hp_fconv3d_CHx7x7_block(_Float16 *o, _Float16 *i, _Float16 *f, int64_t M, + int64_t N, int64_t n_, int64_t C, int64_t F); -void hp_fconv3d_CHx7x7_warm(_Float16 *o, _Float16 *i, _Float16 *f, int64_t M, int64_t N, - int64_t C, int64_t F); +void hp_fconv3d_CHx7x7_warm(_Float16 *o, _Float16 *i, _Float16 *f, int64_t M, + int64_t N, int64_t C, int64_t F); -void hp_fconv3d_warm(_Float16 *o, _Float16 *i, _Float16 *f, int64_t M, int64_t N, - int64_t n_, int64_t C, int64_t F); +void hp_fconv3d_warm(_Float16 *o, _Float16 *i, _Float16 *f, int64_t M, + int64_t N, int64_t n_, int64_t C, int64_t F); // Verify the matrices -int hp_fconv3d_verify(_Float16 *matrix, _Float16 *golden_matrix, int64_t R, int64_t C, - _Float16 threshold); +int hp_fconv3d_verify(_Float16 *matrix, _Float16 *golden_matrix, int64_t R, + int64_t C, _Float16 threshold); #endif diff --git a/apps/dtype-conv3d/kernel/hp-iconv3d.c b/apps/dtype-conv3d/kernel/hp-iconv3d.c index 46b33c73c..9fb180278 100644 --- a/apps/dtype-conv3d/kernel/hp-iconv3d.c +++ b/apps/dtype-conv3d/kernel/hp-iconv3d.c @@ -51,19 +51,19 @@ Change vse64.v and store instructions. Adjust pointer arithmetic: Adjust the stride values for pointer increments (ldo, ldi_pad) - ldo (likely stands for "load output"): This is the stride value used to move to the next row in the output matrix. - ldi_pad (likely stands for "load input padded"): This is the stride value used to move to the next row in the padded input matrix. - Adjust data type in arithmetic instructions: - vfmacc.vf for float, vmacc.vx for int - Same for add and slidedown instruction + ldo (likely stands for "load output"): This is the stride value used to move + to the next row in the output matrix. ldi_pad (likely stands for "load input + padded"): This is the stride value used to move to the next row in the padded + input matrix. Adjust data type in arithmetic instructions: vfmacc.vf for + float, vmacc.vx for int Same for add and slidedown instruction */ #include "hp-iconv3d.h" extern int64_t event_trigger; -int hp_iconv3d_verify(int16_t *matrix, int16_t *golden_matrix, int64_t R, int64_t C, - int64_t threshold) { +int hp_iconv3d_verify(int16_t *matrix, int16_t *golden_matrix, int64_t R, + int64_t C, int64_t threshold) { for (int r = 0; r < R; ++r) for (int c = 0; c < C; ++c) if (!similarity_check(matrix[c + C * r], golden_matrix[c + C * r], @@ -76,7 +76,7 @@ int hp_iconv3d_verify(int16_t *matrix, int16_t *golden_matrix, int64_t R, int64_ } void hp_iconv3d_CHx7x7(int16_t *o, int16_t *i, int16_t *f, int64_t M, int64_t N, - int64_t C, int64_t F) { + int64_t C, int64_t F) { unsigned long int block_size_n; @@ -98,8 +98,8 @@ void hp_iconv3d_CHx7x7(int16_t *o, int16_t *i, int16_t *f, int64_t M, int64_t N, } } -void hp_iconv3d_CHx7x7_warm(int16_t *o, int16_t *i, int16_t *f, int64_t M, int64_t N, - int64_t C, int64_t F) { +void hp_iconv3d_CHx7x7_warm(int16_t *o, int16_t *i, int16_t *f, int64_t M, + int64_t N, int64_t C, int64_t F) { unsigned long int block_size_n; @@ -121,8 +121,8 @@ void hp_iconv3d_CHx7x7_warm(int16_t *o, int16_t *i, int16_t *f, int64_t M, int64 } } -void hp_iconv3d_CHx7x7_block(int16_t *o, int16_t *i, int16_t *f, int64_t M, int64_t N, - int64_t n_, int64_t C, int64_t F) { +void hp_iconv3d_CHx7x7_block(int16_t *o, int16_t *i, int16_t *f, int64_t M, + int64_t N, int64_t n_, int64_t C, int64_t F) { // Helper variables int64_t ldo = N << 1; @@ -928,9 +928,8 @@ void hp_iconv3d_CHx7x7_block(int16_t *o, int16_t *i, int16_t *f, int64_t M, int6 asm volatile("vse16.v v28, (%0); add %0, %0, %1" : "+&r"(o) : "r"(ldo)); } - void hp_iconv3d_warm(int16_t *o, int16_t *i, int16_t *f, int64_t M, int64_t N, - int64_t n_, int64_t C, int64_t F) { + int64_t n_, int64_t C, int64_t F) { // Helper variables int64_t ldo = N << 1; @@ -1027,7 +1026,6 @@ void hp_iconv3d_warm(int16_t *o, int16_t *i, int16_t *f, int64_t M, int64_t N, i_slide_ptr_1 = i__ + n_ + 1 * (N + F - 1); i_slide_ptr_2 = i__ + n_ + 2 * (N + F - 1); - // Main kernel, unrolled by 2 for (int k = 0; k < F / 2; ++k) { // Two base indexes because of the unrolling @@ -1063,7 +1061,6 @@ void hp_iconv3d_warm(int16_t *o, int16_t *i, int16_t *f, int64_t M, int64_t N, asm volatile("vslide1down.vx v6, v4, %0" ::"r"(*i_slide_ptr_1++)); asm volatile("vslide1down.vx v10, v8, %0" ::"r"(*i_slide_ptr_2++)); - } // The very last iterations require mixing the instructions with the store @@ -1080,7 +1077,6 @@ void hp_iconv3d_warm(int16_t *o, int16_t *i, int16_t *f, int64_t M, int64_t N, // Reuse preloaded coefficients // Buffer the next coefficients for faster use - // Bump the input ptr i_ += 3 * (N + F - 1); @@ -1145,43 +1141,41 @@ void hp_iconv3d_warm(int16_t *o, int16_t *i, int16_t *f, int64_t M, int64_t N, if (ch != C - 1) { int64_t base_idx_0 = (ch + 1) * fch_len; - } } - } + } - // Bump the input ptr - i_ += N + F - 1; + // Bump the input ptr + i_ += N + F - 1; #ifdef VCD_DUMP - // Stop dumping VCD - event_trigger = -1; + // Stop dumping VCD + event_trigger = -1; #endif - ////////////// - // UNROLL 1 // - ////////////// + ////////////// + // UNROLL 1 // + ////////////// - // Loop on the channels - for (int ch = 0; ch < C; ++ch) { + // Loop on the channels + for (int ch = 0; ch < C; ++ch) { - // Point to the first element of the channel ch - i__ = i_ + ch * ich_len; + // Point to the first element of the channel ch + i__ = i_ + ch * ich_len; - // Start calculating the next pointers to the elements to be slided in - i_slide_ptr_1 = i__ + n_; + // Start calculating the next pointers to the elements to be slided in + i_slide_ptr_1 = i__ + n_; - for (int k = 0; k < F / 2; ++k) { - // Two base indexes because of the unrolling - // Point to the first element of the current column (k) of the current - // channel (ch) of the filter (f) - int64_t base_idx_0 = (2 * k + 2) + (ch * fch_len); - // Point to the first element of the current column (k+1) of the current - // channel (ch) of the filter (f) - int64_t base_idx_1 = (2 * k + 1) + (ch * fch_len); + for (int k = 0; k < F / 2; ++k) { + // Two base indexes because of the unrolling + // Point to the first element of the current column (k) of the current + // channel (ch) of the filter (f) + int64_t base_idx_0 = (2 * k + 2) + (ch * fch_len); + // Point to the first element of the current column (k+1) of the current + // channel (ch) of the filter (f) + int64_t base_idx_1 = (2 * k + 1) + (ch * fch_len); } - // Bump the input ptr i_ += N + F - 1; } @@ -1217,7 +1211,7 @@ void hp_iconv3d_warm(int16_t *o, int16_t *i, int16_t *f, int64_t M, int64_t N, if ((k | ch) == 0) asm volatile("vmul.vx v28, v0, %0" ::"r"(f[0 + base_idx_0])); else - asm volatile("vslide1down.vx v6, v4, %0" ::"r"(*i_slide_ptr_1++)); + asm volatile("vslide1down.vx v6, v4, %0" ::"r"(*i_slide_ptr_1++)); asm volatile("vslide1down.vx v10, v8, %0" ::"r"(*i_slide_ptr_2++)); asm volatile("vslide1down.vx v14, v12, %0" ::"r"(*i_slide_ptr_3++)); @@ -1226,10 +1220,8 @@ void hp_iconv3d_warm(int16_t *o, int16_t *i, int16_t *f, int64_t M, int64_t N, asm volatile("vslide1down.vx v8, v10, %0" ::"r"(*i_slide_ptr_2++)); asm volatile("vslide1down.vx v12, v14, %0" ::"r"(*i_slide_ptr_3++)); } - } - // Bump the input ptr i_ += 4 * (N + F - 1); @@ -1272,7 +1264,6 @@ void hp_iconv3d_warm(int16_t *o, int16_t *i, int16_t *f, int64_t M, int64_t N, } } - /* //////////////////// // MAIN ALGOMITHM // diff --git a/apps/dtype-conv3d/kernel/hp-iconv3d.h b/apps/dtype-conv3d/kernel/hp-iconv3d.h index f10950489..2727a0057 100644 --- a/apps/dtype-conv3d/kernel/hp-iconv3d.h +++ b/apps/dtype-conv3d/kernel/hp-iconv3d.h @@ -19,10 +19,10 @@ #ifndef ICONV3D_H #define ICONV3D_H +#include "printf.h" +#include "util.h" #include #include -#include "util.h" -#include "printf.h" // Threshold for FP numbers comparison during the final check #define THRESHOLD 0 @@ -34,19 +34,19 @@ #define DATA_WIDTH "int16" void hp_iconv3d_CHx7x7(int16_t *o, int16_t *i, int16_t *f, int64_t M, int64_t N, - int64_t C, int64_t F); + int64_t C, int64_t F); -void hp_iconv3d_CHx7x7_block(int16_t *o, int16_t *i, int16_t *f, int64_t M, int64_t N, - int64_t n_, int64_t C, int64_t F); +void hp_iconv3d_CHx7x7_block(int16_t *o, int16_t *i, int16_t *f, int64_t M, + int64_t N, int64_t n_, int64_t C, int64_t F); -void hp_iconv3d_CHx7x7_warm(int16_t *o, int16_t *i, int16_t *f, int64_t M, int64_t N, - int64_t C, int64_t F); +void hp_iconv3d_CHx7x7_warm(int16_t *o, int16_t *i, int16_t *f, int64_t M, + int64_t N, int64_t C, int64_t F); void hp_iconv3d_warm(int16_t *o, int16_t *i, int16_t *f, int64_t M, int64_t N, - int64_t n_, int64_t C, int64_t F); + int64_t n_, int64_t C, int64_t F); // Verify the matrices -int hp_iconv3d_verify(int16_t *matrix, int16_t *golden_matrix, int64_t R, int64_t C, - int64_t threshold); +int hp_iconv3d_verify(int16_t *matrix, int16_t *golden_matrix, int64_t R, + int64_t C, int64_t threshold); #endif diff --git a/apps/dtype-conv3d/kernel/sp-fconv3d.c b/apps/dtype-conv3d/kernel/sp-fconv3d.c index 095331ce8..5625d23a6 100644 --- a/apps/dtype-conv3d/kernel/sp-fconv3d.c +++ b/apps/dtype-conv3d/kernel/sp-fconv3d.c @@ -51,11 +51,11 @@ Change vse64.v and store instructions. Adjust pointer arithmetic: Adjust the stride values for pointer increments (ldo, ldi_pad) - ldo (likely stands for "load output"): This is the stride value used to move to the next row in the output matrix. - ldi_pad (likely stands for "load input padded"): This is the stride value used to move to the next row in the padded input matrix. - Adjust data type in arithmetic instructions: - vfmacc.vf for float, vmacc.vx for int - Same for add and slidedown instruction + ldo (likely stands for "load output"): This is the stride value used to move + to the next row in the output matrix. ldi_pad (likely stands for "load input + padded"): This is the stride value used to move to the next row in the padded + input matrix. Adjust data type in arithmetic instructions: vfmacc.vf for + float, vmacc.vx for int Same for add and slidedown instruction */ #include "sp-fconv3d.h" @@ -65,7 +65,7 @@ extern int64_t event_trigger; // Verify the matrices int sp_fconv3d_verify(float *matrix, float *golden_matrix, int64_t R, int64_t C, - float threshold) { + float threshold) { for (int r = 0; r < R; ++r) for (int c = 0; c < C; ++c) if (!similarity_check(matrix[c + C * r], golden_matrix[c + C * r], @@ -78,24 +78,27 @@ int sp_fconv3d_verify(float *matrix, float *golden_matrix, int64_t R, int64_t C, } /* -int sp_fconv3d_verify(float *matrix, float *golden_matrix, int64_t R, int64_t C, float threshold) { - for (int64_t r = 0; r < R; ++r) { - for (int64_t c = 0; c < C; ++c) { +int sp_fconv3d_verify(float *matrix, float *golden_matrix, int64_t R, int64_t C, +float threshold) { for (int64_t r = 0; r < R; ++r) { for (int64_t c = 0; c < C; +++c) { // if (1) { - if (!similarity_check(matrix[c + C * r], golden_matrix[c + C * r], threshold)) { + if (!similarity_check(matrix[c + C * r], golden_matrix[c + C * r], +threshold)) { // Convert double to integer parts for matrix value int32_t mat_integer_part = (int32_t)matrix[c + C * r]; - int32_t mat_fractional_part = (int32_t)((matrix[c + C * r] - mat_integer_part) * 1000000); - if (mat_fractional_part < 0) mat_fractional_part = -mat_fractional_part; + int32_t mat_fractional_part = (int32_t)((matrix[c + C * r] - +mat_integer_part) * 1000000); if (mat_fractional_part < 0) mat_fractional_part = +-mat_fractional_part; // Convert double to integer parts for golden matrix value int32_t gold_integer_part = (int32_t)golden_matrix[c + C * r]; - int32_t gold_fractional_part = (int32_t)((golden_matrix[c + C * r] - gold_integer_part) * 1000000); - if (gold_fractional_part < 0) gold_fractional_part = -gold_fractional_part; + int32_t gold_fractional_part = (int32_t)((golden_matrix[c + C * r] - +gold_integer_part) * 1000000); if (gold_fractional_part < 0) +gold_fractional_part = -gold_fractional_part; printf("Error: o[%lld][%lld] = %lld.%06lld, instead of %lld.%06lld\n", - r, c, mat_integer_part, mat_fractional_part, gold_integer_part, gold_fractional_part); - return 1; + r, c, mat_integer_part, mat_fractional_part, gold_integer_part, +gold_fractional_part); return 1; } } } @@ -104,7 +107,7 @@ int sp_fconv3d_verify(float *matrix, float *golden_matrix, int64_t R, int64_t C, */ void sp_fconv3d_CHx7x7(float *o, float *i, float *f, int64_t M, int64_t N, - int64_t C, int64_t F) { + int64_t C, int64_t F) { unsigned long int block_size_n; @@ -127,7 +130,7 @@ void sp_fconv3d_CHx7x7(float *o, float *i, float *f, int64_t M, int64_t N, } void sp_fconv3d_CHx7x7_warm(float *o, float *i, float *f, int64_t M, int64_t N, - int64_t C, int64_t F) { + int64_t C, int64_t F) { unsigned long int block_size_n; @@ -150,7 +153,7 @@ void sp_fconv3d_CHx7x7_warm(float *o, float *i, float *f, int64_t M, int64_t N, } void sp_fconv3d_CHx7x7_block(float *o, float *i, float *f, int64_t M, int64_t N, - int64_t n_, int64_t C, int64_t F) { + int64_t n_, int64_t C, int64_t F) { // Helper variables int64_t ldo = N << 2; @@ -956,9 +959,8 @@ void sp_fconv3d_CHx7x7_block(float *o, float *i, float *f, int64_t M, int64_t N, asm volatile("vse32.v v28, (%0); add %0, %0, %1" : "+&r"(o) : "r"(ldo)); } - void sp_fconv3d_warm(float *o, float *i, float *f, int64_t M, int64_t N, - int64_t n_, int64_t C, int64_t F) { + int64_t n_, int64_t C, int64_t F) { // Helper variables int64_t ldo = N << 2; @@ -1055,7 +1057,6 @@ void sp_fconv3d_warm(float *o, float *i, float *f, int64_t M, int64_t N, i_slide_ptr_1 = i__ + n_ + 1 * (N + F - 1); i_slide_ptr_2 = i__ + n_ + 2 * (N + F - 1); - // Main kernel, unrolled by 2 for (int k = 0; k < F / 2; ++k) { // Two base indexes because of the unrolling @@ -1091,7 +1092,6 @@ void sp_fconv3d_warm(float *o, float *i, float *f, int64_t M, int64_t N, asm volatile("vfslide1down.vf v6, v4, %0" ::"f"(*i_slide_ptr_1++)); asm volatile("vfslide1down.vf v10, v8, %0" ::"f"(*i_slide_ptr_2++)); - } // The very last iterations require mixing the instructions with the store @@ -1108,7 +1108,6 @@ void sp_fconv3d_warm(float *o, float *i, float *f, int64_t M, int64_t N, // Reuse preloaded coefficients // Buffer the next coefficients for faster use - // Bump the input ptr i_ += 3 * (N + F - 1); @@ -1173,43 +1172,41 @@ void sp_fconv3d_warm(float *o, float *i, float *f, int64_t M, int64_t N, if (ch != C - 1) { int64_t base_idx_0 = (ch + 1) * fch_len; - } } - } + } - // Bump the input ptr - i_ += N + F - 1; + // Bump the input ptr + i_ += N + F - 1; #ifdef VCD_DUMP - // Stop dumping VCD - event_trigger = -1; + // Stop dumping VCD + event_trigger = -1; #endif - ////////////// - // UNROLL 1 // - ////////////// + ////////////// + // UNROLL 1 // + ////////////// - // Loop on the channels - for (int ch = 0; ch < C; ++ch) { + // Loop on the channels + for (int ch = 0; ch < C; ++ch) { - // Point to the first element of the channel ch - i__ = i_ + ch * ich_len; + // Point to the first element of the channel ch + i__ = i_ + ch * ich_len; - // Start calculating the next pointers to the elements to be slided in - i_slide_ptr_1 = i__ + n_; + // Start calculating the next pointers to the elements to be slided in + i_slide_ptr_1 = i__ + n_; - for (int k = 0; k < F / 2; ++k) { - // Two base indexes because of the unrolling - // Point to the first element of the current column (k) of the current - // channel (ch) of the filter (f) - int64_t base_idx_0 = (2 * k + 2) + (ch * fch_len); - // Point to the first element of the current column (k+1) of the current - // channel (ch) of the filter (f) - int64_t base_idx_1 = (2 * k + 1) + (ch * fch_len); + for (int k = 0; k < F / 2; ++k) { + // Two base indexes because of the unrolling + // Point to the first element of the current column (k) of the current + // channel (ch) of the filter (f) + int64_t base_idx_0 = (2 * k + 2) + (ch * fch_len); + // Point to the first element of the current column (k+1) of the current + // channel (ch) of the filter (f) + int64_t base_idx_1 = (2 * k + 1) + (ch * fch_len); } - // Bump the input ptr i_ += N + F - 1; } @@ -1245,7 +1242,7 @@ void sp_fconv3d_warm(float *o, float *i, float *f, int64_t M, int64_t N, if ((k | ch) == 0) asm volatile("vfmul.vf v28, v0, %0" ::"f"(f[0 + base_idx_0])); else - asm volatile("vfslide1down.vf v6, v4, %0" ::"f"(*i_slide_ptr_1++)); + asm volatile("vfslide1down.vf v6, v4, %0" ::"f"(*i_slide_ptr_1++)); asm volatile("vfslide1down.vf v10, v8, %0" ::"f"(*i_slide_ptr_2++)); asm volatile("vfslide1down.vf v14, v12, %0" ::"f"(*i_slide_ptr_3++)); @@ -1254,10 +1251,8 @@ void sp_fconv3d_warm(float *o, float *i, float *f, int64_t M, int64_t N, asm volatile("vfslide1down.vf v8, v10, %0" ::"f"(*i_slide_ptr_2++)); asm volatile("vfslide1down.vf v12, v14, %0" ::"f"(*i_slide_ptr_3++)); } - } - // Bump the input ptr i_ += 4 * (N + F - 1); @@ -1300,7 +1295,6 @@ void sp_fconv3d_warm(float *o, float *i, float *f, int64_t M, int64_t N, } } - /* //////////////////// // MAIN ALGOMITHM // diff --git a/apps/dtype-conv3d/kernel/sp-fconv3d.h b/apps/dtype-conv3d/kernel/sp-fconv3d.h index fb5ae3672..9dcfb6b19 100644 --- a/apps/dtype-conv3d/kernel/sp-fconv3d.h +++ b/apps/dtype-conv3d/kernel/sp-fconv3d.h @@ -19,10 +19,10 @@ #ifndef FCONV3D_H #define FCONV3D_H -#include -#include #include "printf.h" #include "util.h" +#include +#include // Threshold for FP numbers comparison during the final check #define THRESHOLD 0.0001 @@ -34,19 +34,19 @@ #define DATA_WIDTH "float32" void sp_fconv3d_CHx7x7(float *o, float *i, float *f, int64_t M, int64_t N, - int64_t C, int64_t F); + int64_t C, int64_t F); void sp_fconv3d_CHx7x7_block(float *o, float *i, float *f, int64_t M, int64_t N, - int64_t n_, int64_t C, int64_t F); + int64_t n_, int64_t C, int64_t F); void sp_fconv3d_CHx7x7_warm(float *o, float *i, float *f, int64_t M, int64_t N, - int64_t C, int64_t F); + int64_t C, int64_t F); void sp_fconv3d_warm(float *o, float *i, float *f, int64_t M, int64_t N, - int64_t n_, int64_t C, int64_t F); + int64_t n_, int64_t C, int64_t F); // Verify the matrices int sp_fconv3d_verify(float *matrix, float *golden_matrix, int64_t R, int64_t C, - float threshold); + float threshold); #endif diff --git a/apps/dtype-conv3d/kernel/sp-iconv3d.c b/apps/dtype-conv3d/kernel/sp-iconv3d.c index 9a53ef474..621430645 100644 --- a/apps/dtype-conv3d/kernel/sp-iconv3d.c +++ b/apps/dtype-conv3d/kernel/sp-iconv3d.c @@ -51,19 +51,19 @@ Change vse64.v and store instructions. Adjust pointer arithmetic: Adjust the stride values for pointer increments (ldo, ldi_pad) - ldo (likely stands for "load output"): This is the stride value used to move to the next row in the output matrix. - ldi_pad (likely stands for "load input padded"): This is the stride value used to move to the next row in the padded input matrix. - Adjust data type in arithmetic instructions: - vfmacc.vf for float, vmacc.vx for int - Same for add and slidedown instruction + ldo (likely stands for "load output"): This is the stride value used to move + to the next row in the output matrix. ldi_pad (likely stands for "load input + padded"): This is the stride value used to move to the next row in the padded + input matrix. Adjust data type in arithmetic instructions: vfmacc.vf for + float, vmacc.vx for int Same for add and slidedown instruction */ #include "sp-iconv3d.h" extern int64_t event_trigger; -int sp_iconv3d_verify(int32_t *matrix, int32_t *golden_matrix, int64_t R, int64_t C, - int64_t threshold) { +int sp_iconv3d_verify(int32_t *matrix, int32_t *golden_matrix, int64_t R, + int64_t C, int64_t threshold) { for (int r = 0; r < R; ++r) for (int c = 0; c < C; ++c) if (!similarity_check(matrix[c + C * r], golden_matrix[c + C * r], @@ -76,7 +76,7 @@ int sp_iconv3d_verify(int32_t *matrix, int32_t *golden_matrix, int64_t R, int64_ } void sp_iconv3d_CHx7x7(int32_t *o, int32_t *i, int32_t *f, int64_t M, int64_t N, - int64_t C, int64_t F) { + int64_t C, int64_t F) { unsigned long int block_size_n; @@ -98,8 +98,8 @@ void sp_iconv3d_CHx7x7(int32_t *o, int32_t *i, int32_t *f, int64_t M, int64_t N, } } -void sp_iconv3d_CHx7x7_warm(int32_t *o, int32_t *i, int32_t *f, int64_t M, int64_t N, - int64_t C, int64_t F) { +void sp_iconv3d_CHx7x7_warm(int32_t *o, int32_t *i, int32_t *f, int64_t M, + int64_t N, int64_t C, int64_t F) { unsigned long int block_size_n; @@ -121,8 +121,8 @@ void sp_iconv3d_CHx7x7_warm(int32_t *o, int32_t *i, int32_t *f, int64_t M, int64 } } -void sp_iconv3d_CHx7x7_block(int32_t *o, int32_t *i, int32_t *f, int64_t M, int64_t N, - int64_t n_, int64_t C, int64_t F) { +void sp_iconv3d_CHx7x7_block(int32_t *o, int32_t *i, int32_t *f, int64_t M, + int64_t N, int64_t n_, int64_t C, int64_t F) { // Helper variables int64_t ldo = N << 2; @@ -928,9 +928,8 @@ void sp_iconv3d_CHx7x7_block(int32_t *o, int32_t *i, int32_t *f, int64_t M, int6 asm volatile("vse32.v v28, (%0); add %0, %0, %1" : "+&r"(o) : "r"(ldo)); } - void sp_iconv3d_warm(int32_t *o, int32_t *i, int32_t *f, int64_t M, int64_t N, - int64_t n_, int64_t C, int64_t F) { + int64_t n_, int64_t C, int64_t F) { // Helper variables int64_t ldo = N << 2; @@ -1027,7 +1026,6 @@ void sp_iconv3d_warm(int32_t *o, int32_t *i, int32_t *f, int64_t M, int64_t N, i_slide_ptr_1 = i__ + n_ + 1 * (N + F - 1); i_slide_ptr_2 = i__ + n_ + 2 * (N + F - 1); - // Main kernel, unrolled by 2 for (int k = 0; k < F / 2; ++k) { // Two base indexes because of the unrolling @@ -1063,7 +1061,6 @@ void sp_iconv3d_warm(int32_t *o, int32_t *i, int32_t *f, int64_t M, int64_t N, asm volatile("vslide1down.vx v6, v4, %0" ::"r"(*i_slide_ptr_1++)); asm volatile("vslide1down.vx v10, v8, %0" ::"r"(*i_slide_ptr_2++)); - } // The very last iterations require mixing the instructions with the store @@ -1080,7 +1077,6 @@ void sp_iconv3d_warm(int32_t *o, int32_t *i, int32_t *f, int64_t M, int64_t N, // Reuse preloaded coefficients // Buffer the next coefficients for faster use - // Bump the input ptr i_ += 3 * (N + F - 1); @@ -1145,43 +1141,41 @@ void sp_iconv3d_warm(int32_t *o, int32_t *i, int32_t *f, int64_t M, int64_t N, if (ch != C - 1) { int64_t base_idx_0 = (ch + 1) * fch_len; - } } - } + } - // Bump the input ptr - i_ += N + F - 1; + // Bump the input ptr + i_ += N + F - 1; #ifdef VCD_DUMP - // Stop dumping VCD - event_trigger = -1; + // Stop dumping VCD + event_trigger = -1; #endif - ////////////// - // UNROLL 1 // - ////////////// + ////////////// + // UNROLL 1 // + ////////////// - // Loop on the channels - for (int ch = 0; ch < C; ++ch) { + // Loop on the channels + for (int ch = 0; ch < C; ++ch) { - // Point to the first element of the channel ch - i__ = i_ + ch * ich_len; + // Point to the first element of the channel ch + i__ = i_ + ch * ich_len; - // Start calculating the next pointers to the elements to be slided in - i_slide_ptr_1 = i__ + n_; + // Start calculating the next pointers to the elements to be slided in + i_slide_ptr_1 = i__ + n_; - for (int k = 0; k < F / 2; ++k) { - // Two base indexes because of the unrolling - // Point to the first element of the current column (k) of the current - // channel (ch) of the filter (f) - int64_t base_idx_0 = (2 * k + 2) + (ch * fch_len); - // Point to the first element of the current column (k+1) of the current - // channel (ch) of the filter (f) - int64_t base_idx_1 = (2 * k + 1) + (ch * fch_len); + for (int k = 0; k < F / 2; ++k) { + // Two base indexes because of the unrolling + // Point to the first element of the current column (k) of the current + // channel (ch) of the filter (f) + int64_t base_idx_0 = (2 * k + 2) + (ch * fch_len); + // Point to the first element of the current column (k+1) of the current + // channel (ch) of the filter (f) + int64_t base_idx_1 = (2 * k + 1) + (ch * fch_len); } - // Bump the input ptr i_ += N + F - 1; } @@ -1217,7 +1211,7 @@ void sp_iconv3d_warm(int32_t *o, int32_t *i, int32_t *f, int64_t M, int64_t N, if ((k | ch) == 0) asm volatile("vmul.vx v28, v0, %0" ::"r"(f[0 + base_idx_0])); else - asm volatile("vslide1down.vx v6, v4, %0" ::"r"(*i_slide_ptr_1++)); + asm volatile("vslide1down.vx v6, v4, %0" ::"r"(*i_slide_ptr_1++)); asm volatile("vslide1down.vx v10, v8, %0" ::"r"(*i_slide_ptr_2++)); asm volatile("vslide1down.vx v14, v12, %0" ::"r"(*i_slide_ptr_3++)); @@ -1226,10 +1220,8 @@ void sp_iconv3d_warm(int32_t *o, int32_t *i, int32_t *f, int64_t M, int64_t N, asm volatile("vslide1down.vx v8, v10, %0" ::"r"(*i_slide_ptr_2++)); asm volatile("vslide1down.vx v12, v14, %0" ::"r"(*i_slide_ptr_3++)); } - } - // Bump the input ptr i_ += 4 * (N + F - 1); @@ -1272,7 +1264,6 @@ void sp_iconv3d_warm(int32_t *o, int32_t *i, int32_t *f, int64_t M, int64_t N, } } - /* //////////////////// // MAIN ALGOMITHM // diff --git a/apps/dtype-conv3d/kernel/sp-iconv3d.h b/apps/dtype-conv3d/kernel/sp-iconv3d.h index c5c6b5066..cbe001401 100644 --- a/apps/dtype-conv3d/kernel/sp-iconv3d.h +++ b/apps/dtype-conv3d/kernel/sp-iconv3d.h @@ -19,10 +19,10 @@ #ifndef ICONV3D_H #define ICONV3D_H +#include "printf.h" +#include "util.h" #include #include -#include "util.h" -#include "printf.h" // Threshold for FP numbers comparison during the final check #define THRESHOLD 0 @@ -34,19 +34,19 @@ #define DATA_WIDTH "int32" void sp_iconv3d_CHx7x7(int32_t *o, int32_t *i, int32_t *f, int64_t M, int64_t N, - int64_t C, int64_t F); + int64_t C, int64_t F); -void sp_iconv3d_CHx7x7_block(int32_t *o, int32_t *i, int32_t *f, int64_t M, int64_t N, - int64_t n_, int64_t C, int64_t F); +void sp_iconv3d_CHx7x7_block(int32_t *o, int32_t *i, int32_t *f, int64_t M, + int64_t N, int64_t n_, int64_t C, int64_t F); -void sp_iconv3d_CHx7x7_warm(int32_t *o, int32_t *i, int32_t *f, int64_t M, int64_t N, - int64_t C, int64_t F); +void sp_iconv3d_CHx7x7_warm(int32_t *o, int32_t *i, int32_t *f, int64_t M, + int64_t N, int64_t C, int64_t F); void sp_iconv3d_warm(int32_t *o, int32_t *i, int32_t *f, int64_t M, int64_t N, - int64_t n_, int64_t C, int64_t F); + int64_t n_, int64_t C, int64_t F); // Verify the matrices -int sp_iconv3d_verify(int32_t *matrix, int32_t *golden_matrix, int64_t R, int64_t C, - int64_t threshold); +int sp_iconv3d_verify(int32_t *matrix, int32_t *golden_matrix, int64_t R, + int64_t C, int64_t threshold); #endif diff --git a/apps/dtype-conv3d/main.c b/apps/dtype-conv3d/main.c index da6f80b4a..53b1ba640 100644 --- a/apps/dtype-conv3d/main.c +++ b/apps/dtype-conv3d/main.c @@ -125,8 +125,8 @@ int main() { float utilization = 100 * performance / (2.0 * NR_LANES * DTYPE_FACTOR); printf("The execution took %d cycles.\n", runtime); - printf("The performance is %f %s-OP/cycle (%f%% utilization).\n", - performance, DTYPE_PREFIX, utilization); + printf("The performance is %f %s-OP/cycle (%f%% utilization).\n", performance, + DTYPE_PREFIX, utilization); // Verify correctness printf("Verifying result...\n");