From c33aee3dbce5371d901e7b653c1fa7037e430607 Mon Sep 17 00:00:00 2001 From: Diyou Shen Date: Tue, 3 Dec 2024 14:26:58 +0100 Subject: [PATCH] [SW] Update main files for better Cache/SPM configurations. --- sw/spatzBenchmarks/dp-faxpy-cache/main.c | 2 +- sw/spatzBenchmarks/dp-faxpy/main.c | 2 +- sw/spatzBenchmarks/dp-fdotp/main.c | 86 ++++++++++++------- sw/spatzBenchmarks/dp-fft-cache/main.c | 2 +- sw/spatzBenchmarks/dp-fft/main.c | 7 +- sw/spatzBenchmarks/dp-fmatmul-4x4vl/main.c | 17 +++- sw/spatzBenchmarks/dp-fmatmul-8x2vl/main.c | 16 +++- sw/spatzBenchmarks/dp-fmatmul/main.c | 17 +++- .../dp-mxfmatmul-m4n4k4-b2/main.c | 8 +- .../dp-mxfmatmul-m8n4k4-b2/main.c | 8 +- .../dp-mxfmatmul-m8n4k4-b4/main.c | 10 +-- 11 files changed, 124 insertions(+), 51 deletions(-) diff --git a/sw/spatzBenchmarks/dp-faxpy-cache/main.c b/sw/spatzBenchmarks/dp-faxpy-cache/main.c index 0f78cbb..af4fbbf 100644 --- a/sw/spatzBenchmarks/dp-faxpy-cache/main.c +++ b/sw/spatzBenchmarks/dp-faxpy-cache/main.c @@ -44,7 +44,7 @@ int main() { if (cid == 0) { // Init the cache - l1d_init(32); + l1d_init(16); } // Wait for all cores to finish diff --git a/sw/spatzBenchmarks/dp-faxpy/main.c b/sw/spatzBenchmarks/dp-faxpy/main.c index 59c79f3..af82f3c 100644 --- a/sw/spatzBenchmarks/dp-faxpy/main.c +++ b/sw/spatzBenchmarks/dp-faxpy/main.c @@ -44,7 +44,7 @@ int main() { if (cid == 0) { // Init the cache - l1d_init(32); + l1d_init(120); } // Reset timer diff --git a/sw/spatzBenchmarks/dp-fdotp/main.c b/sw/spatzBenchmarks/dp-fdotp/main.c index 85a24ab..e59e1da 100644 --- a/sw/spatzBenchmarks/dp-fdotp/main.c +++ b/sw/spatzBenchmarks/dp-fdotp/main.c @@ -23,6 +23,8 @@ #include DATAHEADER #include "kernel/fdotp.c" +#define USE_CACHE + double *a; double *b; double *result; @@ -41,9 +43,14 @@ static inline int fp_check(const double a, const double b) { int main() { const unsigned int num_cores = snrt_cluster_core_num(); const unsigned int cid = snrt_cluster_core_idx(); + const int measure_iter = 1; + + #ifdef USE_CACHE + uint32_t spm_size = 16; + #else + uint32_t spm_size = 120; + #endif - uint32_t spm_size = 32; - if (cid == 0) { // Init the cache l1d_init(spm_size); @@ -54,10 +61,20 @@ int main() { // Reset timer unsigned int timer = (unsigned int)-1; + unsigned int timer_tmp = 0; const unsigned int dim = dotp_l.M / num_cores; // Allocate the matrices + #ifdef USE_CACHE + if (cid == 0) { + result = (double *)snrt_l1alloc(num_cores * sizeof(double)); + } + + double *a_int = dotp_A_dram + dim * cid; + double *b_int = dotp_B_dram + dim * cid; + + #else if (cid == 0) { a = (double *)snrt_l1alloc(dotp_l.M * sizeof(double)); b = (double *)snrt_l1alloc(dotp_l.M * sizeof(double)); @@ -78,42 +95,51 @@ int main() { double *a_int = a + dim * cid; double *b_int = b + dim * cid; + #endif + + // Wait for all cores to finish snrt_cluster_hw_barrier(); - // Start dump - if (cid == 0) - start_kernel(); + for (int iter = 0; iter < measure_iter; iter ++) { + // Start dump + if (cid == 0) + start_kernel(); - // Start timer - if (cid == 0) - timer = benchmark_get_cycle(); + // Start timer + if (cid == 0) + timer_tmp = benchmark_get_cycle(); - // Calculate dotp - double acc; - acc = fdotp_v64b(a_int, b_int, dim); - result[cid] = acc; + // Calculate dotp + double acc; + acc = fdotp_v64b(a_int, b_int, dim); + result[cid] = acc; - // Wait for all cores to finish - snrt_cluster_hw_barrier(); + // Wait for all cores to finish + snrt_cluster_hw_barrier(); - // Final reduction - if (cid == 0) { - for (unsigned int i = 1; i < num_cores; ++i) - acc += result[i]; - result[0] = acc; - } + // Final reduction + if (cid == 0) { + for (unsigned int i = 1; i < num_cores; ++i) + acc += result[i]; + result[0] = acc; + } - // Wait for all cores to finish - snrt_cluster_hw_barrier(); + // Wait for all cores to finish + snrt_cluster_hw_barrier(); - // End dump - if (cid == 0) - stop_kernel(); + // End dump + if (cid == 0) + stop_kernel(); - // End timer and check if new best runtime - if (cid == 0) - timer = benchmark_get_cycle() - timer; + // End timer and check if new best runtime + if (cid == 0) { + timer_tmp = benchmark_get_cycle() - timer_tmp; + timer = (timer < timer_tmp) ? timer : timer_tmp; + } + + snrt_cluster_hw_barrier(); + } // Check and display results if (cid == 0) { @@ -127,8 +153,8 @@ int main() { } if (cid == 0) - if (fp_check(result[0], dotp_result)) { - printf("Error: Result = %f, Golden = %f\n", result[0], dotp_result); + if (fp_check(result[0], dotp_result*measure_iter)) { + printf("Error: Result = %f, Golden = %f\n", result[0], dotp_result*measure_iter); return -1; } diff --git a/sw/spatzBenchmarks/dp-fft-cache/main.c b/sw/spatzBenchmarks/dp-fft-cache/main.c index ae3cac7..6b43092 100644 --- a/sw/spatzBenchmarks/dp-fft-cache/main.c +++ b/sw/spatzBenchmarks/dp-fft-cache/main.c @@ -42,7 +42,7 @@ int main() { if (cid == 0) { // Init the cache with half-half - l1d_init(32); + l1d_init(16); } // Wait for all cores to finish snrt_cluster_hw_barrier(); diff --git a/sw/spatzBenchmarks/dp-fft/main.c b/sw/spatzBenchmarks/dp-fft/main.c index 4750812..65415fc 100644 --- a/sw/spatzBenchmarks/dp-fft/main.c +++ b/sw/spatzBenchmarks/dp-fft/main.c @@ -47,7 +47,7 @@ int main() { if (cid == 0) { // Init the cache - l1d_init(32); + l1d_init(120); } // log2(nfft). @@ -66,6 +66,9 @@ int main() { bitrev = (uint16_t *)snrt_l1alloc((NFFT / 4) * sizeof(uint16_t)); } + timer = benchmark_get_cycle(); + + // Initialize the matrices if (cid == 0) { snrt_dma_start_1d(samples, samples_dram, 2 * NFFT * sizeof(double)); @@ -90,7 +93,7 @@ int main() { snrt_cluster_hw_barrier(); // Start timer - timer = benchmark_get_cycle(); + // timer = benchmark_get_cycle(); // Start dump if (cid == 0) diff --git a/sw/spatzBenchmarks/dp-fmatmul-4x4vl/main.c b/sw/spatzBenchmarks/dp-fmatmul-4x4vl/main.c index 24fe207..2666e41 100644 --- a/sw/spatzBenchmarks/dp-fmatmul-4x4vl/main.c +++ b/sw/spatzBenchmarks/dp-fmatmul-4x4vl/main.c @@ -23,6 +23,8 @@ #include DATAHEADER #include "kernel/dp-fmatmul.c" +#define USE_CACHE + #ifndef KERNEL_SIZE #define KERNEL_SIZE 4 #endif @@ -61,8 +63,12 @@ int main() { unsigned int m_start, m_end; unsigned int p_start, p_end; unsigned int kernel_size; + + #ifdef USE_CACHE + uint32_t spm_size = 16; + #else uint32_t spm_size = 120; - + #endif if (cid == 0) { // Init the cache l1d_init(spm_size); @@ -70,11 +76,18 @@ int main() { // Wait for all cores to finish snrt_cluster_hw_barrier(); + // Allocate the matrices in the local tile if (cid == 0) { + #ifdef USE_CACHE + a = gemm_A_dram; + b = gemm_B_dram; + c = gemm_C_dram; + #else a = (double *)snrt_l1alloc(gemm_l.M * gemm_l.K * sizeof(double)); b = (double *)snrt_l1alloc(gemm_l.K * gemm_l.N * sizeof(double)); c = (double *)snrt_l1alloc(gemm_l.M * gemm_l.N * sizeof(double)); + #endif } // Reset timer @@ -93,12 +106,14 @@ int main() { snrt_cluster_hw_barrier(); // Initialize matrices + #ifndef USE_CACHE if (cid == 0) { snrt_dma_start_1d(a, gemm_A_dram, gemm_l.M * gemm_l.K * sizeof(double)); snrt_dma_start_1d(b, gemm_B_dram, gemm_l.K * gemm_l.N * sizeof(double)); snrt_dma_start_1d(c, gemm_C_dram, gemm_l.M * gemm_l.N * sizeof(double)); snrt_dma_wait_all(); } + #endif // Wait for all cores to finish snrt_cluster_hw_barrier(); diff --git a/sw/spatzBenchmarks/dp-fmatmul-8x2vl/main.c b/sw/spatzBenchmarks/dp-fmatmul-8x2vl/main.c index 48c2118..5e6b339 100644 --- a/sw/spatzBenchmarks/dp-fmatmul-8x2vl/main.c +++ b/sw/spatzBenchmarks/dp-fmatmul-8x2vl/main.c @@ -23,6 +23,8 @@ #include DATAHEADER #include "kernel/dp-fmatmul.c" +// #define USE_CACHE + #ifndef KERNEL_SIZE #define KERNEL_SIZE 8 #endif @@ -53,8 +55,12 @@ int verify_matrix(double *matrix, const double *checksum, int main() { const unsigned int num_cores = snrt_cluster_core_num(); const unsigned int cid = snrt_cluster_core_idx(); + + #ifdef USE_CACHE + uint32_t spm_size = 16; + #else uint32_t spm_size = 120; - + #endif if (cid == 0) { // Init the cache l1d_init(spm_size); @@ -72,9 +78,15 @@ int main() { // Allocate the matrices in the local tile if (cid == 0) { + #ifdef USE_CACHE + a = gemm_A_dram; + b = gemm_B_dram; + c = gemm_C_dram; + #else a = (double *)snrt_l1alloc(gemm_l.M * gemm_l.K * sizeof(double)); b = (double *)snrt_l1alloc(gemm_l.K * gemm_l.N * sizeof(double)); c = (double *)snrt_l1alloc(gemm_l.M * gemm_l.N * sizeof(double)); + #endif } // Reset timer @@ -93,12 +105,14 @@ int main() { snrt_cluster_hw_barrier(); // Initialize matrices + #ifndef USE_CACHE if (cid == 0) { snrt_dma_start_1d(a, gemm_A_dram, gemm_l.M * gemm_l.K * sizeof(double)); snrt_dma_start_1d(b, gemm_B_dram, gemm_l.K * gemm_l.N * sizeof(double)); snrt_dma_start_1d(c, gemm_C_dram, gemm_l.M * gemm_l.N * sizeof(double)); snrt_dma_wait_all(); } + #endif // Wait for all cores to finish snrt_cluster_hw_barrier(); diff --git a/sw/spatzBenchmarks/dp-fmatmul/main.c b/sw/spatzBenchmarks/dp-fmatmul/main.c index 007c18f..9688301 100644 --- a/sw/spatzBenchmarks/dp-fmatmul/main.c +++ b/sw/spatzBenchmarks/dp-fmatmul/main.c @@ -23,6 +23,8 @@ #include DATAHEADER #include "kernel/dp-fmatmul.c" +#define USE_CACHE + double *a; double *b; double *c; @@ -49,8 +51,13 @@ int verify_matrix(double *matrix, const double *checksum, int main() { const unsigned int num_cores = snrt_cluster_core_num(); const unsigned int cid = snrt_cluster_core_idx(); + + #ifdef USE_CACHE + uint32_t spm_size = 32; + #else uint32_t spm_size = 120; - + #endif + if (cid == 0) { // Init the cache l1d_init(spm_size); @@ -68,9 +75,15 @@ int main() { // Allocate the matrices in the local tile if (cid == 0) { + #ifdef USE_CACHE + a = gemm_A_dram; + b = gemm_B_dram; + c = gemm_C_dram; + #else a = (double *)snrt_l1alloc(gemm_l.M * gemm_l.K * sizeof(double)); b = (double *)snrt_l1alloc(gemm_l.K * gemm_l.N * sizeof(double)); c = (double *)snrt_l1alloc(gemm_l.M * gemm_l.N * sizeof(double)); + #endif } // Reset timer @@ -89,12 +102,14 @@ int main() { snrt_cluster_hw_barrier(); // Initialize matrices + #ifndef USE_CACHE if (cid == 0) { snrt_dma_start_1d(a, gemm_A_dram, gemm_l.M * gemm_l.K * sizeof(double)); snrt_dma_start_1d(b, gemm_B_dram, gemm_l.K * gemm_l.N * sizeof(double)); snrt_dma_start_1d(c, gemm_C_dram, gemm_l.M * gemm_l.N * sizeof(double)); snrt_dma_wait_all(); } + #endif // Wait for all cores to finish snrt_cluster_hw_barrier(); diff --git a/sw/spatzBenchmarks/dp-mxfmatmul-m4n4k4-b2/main.c b/sw/spatzBenchmarks/dp-mxfmatmul-m4n4k4-b2/main.c index 0f72832..06ef570 100644 --- a/sw/spatzBenchmarks/dp-mxfmatmul-m4n4k4-b2/main.c +++ b/sw/spatzBenchmarks/dp-mxfmatmul-m4n4k4-b2/main.c @@ -124,7 +124,7 @@ int main() { #ifndef USE_CACHE uint32_t spm_size = 120; // 120 KB out of 128 KB #else - uint32_t spm_size = 32; // Reserve small portion for SPM + uint32_t spm_size = 16; // Reserve small portion for stack only #endif if (cid == 0) { @@ -142,7 +142,7 @@ int main() { b = (double *)snrt_l1alloc(gemm_l.K * gemm_l.N * sizeof(double)); c = (double *)snrt_l1alloc(gemm_l.M * gemm_l.N * sizeof(double)); } -#else +#else a = gemm_A_dram; b = gemm_B_dram; c = gemm_C_dram; @@ -176,7 +176,7 @@ int main() { snrt_dma_start_1d(c, gemm_C_dram, gemm_l.M * gemm_l.N * sizeof(double)); snrt_dma_wait_all(); } -#endif +#endif // Wait for all cores to finish snrt_cluster_hw_barrier(); @@ -251,7 +251,7 @@ int main() { for (unsigned int j = 0; j < gemm_l.N; j++) { checksum += c[i * gemm_l.N + j]; } - printf("Checksum[%d]=%f\n", i, checksum); + // printf("Checksum[%d]=%f\n", i, checksum); double diff = checksum - (double)gemm_checksum[i]; if (diff < 0) diff = -diff; diff --git a/sw/spatzBenchmarks/dp-mxfmatmul-m8n4k4-b2/main.c b/sw/spatzBenchmarks/dp-mxfmatmul-m8n4k4-b2/main.c index cd972d4..23b0b44 100644 --- a/sw/spatzBenchmarks/dp-mxfmatmul-m8n4k4-b2/main.c +++ b/sw/spatzBenchmarks/dp-mxfmatmul-m8n4k4-b2/main.c @@ -126,7 +126,7 @@ int main() { #else uint32_t spm_size = 32; // Reserve small portion for SPM #endif - + if (cid == 0) { // Init the cache l1d_init(spm_size); @@ -142,7 +142,7 @@ int main() { b = (double *)snrt_l1alloc(gemm_l.K * gemm_l.N * sizeof(double)); c = (double *)snrt_l1alloc(gemm_l.M * gemm_l.N * sizeof(double)); } -#else +#else a = gemm_A_dram; b = gemm_B_dram; c = gemm_C_dram; @@ -176,7 +176,7 @@ int main() { snrt_dma_start_1d(c, gemm_C_dram, gemm_l.M * gemm_l.N * sizeof(double)); snrt_dma_wait_all(); } -#endif +#endif // Wait for all cores to finish snrt_cluster_hw_barrier(); @@ -251,7 +251,7 @@ int main() { for (unsigned int j = 0; j < gemm_l.N; j++) { checksum += c[i * gemm_l.N + j]; } - printf("Checksum[%d]=%f\n", i, checksum); + // printf("Checksum[%d]=%f\n", i, checksum); double diff = checksum - (double)gemm_checksum[i]; if (diff < 0) diff = -diff; diff --git a/sw/spatzBenchmarks/dp-mxfmatmul-m8n4k4-b4/main.c b/sw/spatzBenchmarks/dp-mxfmatmul-m8n4k4-b4/main.c index 9c1adf6..66640ba 100644 --- a/sw/spatzBenchmarks/dp-mxfmatmul-m8n4k4-b4/main.c +++ b/sw/spatzBenchmarks/dp-mxfmatmul-m8n4k4-b4/main.c @@ -124,9 +124,9 @@ int main() { #ifndef USE_CACHE uint32_t spm_size = 120; // 120 KB out of 128 KB #else - uint32_t spm_size = 32; // Reserve small portion for SPM + uint32_t spm_size = 16; // Reserve small portion for SPM #endif - + if (cid == 0) { // Init the cache l1d_init(spm_size); @@ -142,7 +142,7 @@ int main() { b = (double *)snrt_l1alloc(gemm_l.K * gemm_l.N * sizeof(double)); c = (double *)snrt_l1alloc(gemm_l.M * gemm_l.N * sizeof(double)); } -#else +#else a = gemm_A_dram; b = gemm_B_dram; c = gemm_C_dram; @@ -176,7 +176,7 @@ int main() { snrt_dma_start_1d(c, gemm_C_dram, gemm_l.M * gemm_l.N * sizeof(double)); snrt_dma_wait_all(); } -#endif +#endif // Wait for all cores to finish snrt_cluster_hw_barrier(); @@ -251,7 +251,7 @@ int main() { for (unsigned int j = 0; j < gemm_l.N; j++) { checksum += c[i * gemm_l.N + j]; } - printf("Checksum[%d]=%f\n", i, checksum); + // printf("Checksum[%d]=%f\n", i, checksum); double diff = checksum - (double)gemm_checksum[i]; if (diff < 0) diff = -diff;