diff --git a/sw/dnn/src/softmax.h b/sw/dnn/src/softmax.h index f58d48fe63..6a6db8436a 100644 --- a/sw/dnn/src/softmax.h +++ b/sw/dnn/src/softmax.h @@ -1,12 +1,13 @@ // Copyright 2020 ETH Zurich and University of Bologna. // Licensed under the Apache License, Version 2.0, see LICENSE for details. // SPDX-License-Identifier: Apache-2.0 +// +// Viviane Potocnik, ETH Zurich, #pragma once #include "math.h" #include "snrt.h" -// #include "printf.h" #include "utils.h" /** @@ -50,8 +51,11 @@ static inline void softmax_fp32(float *input, float *output, int32_t ldI, float max_core = 0.0; // max value of the current core float sum = 0.0; // sum of the exp values of the current core - // uint32_t compute_id = snrt_global_core_idx(); - // uint32_t num_cores = snrt_cluster_compute_core_num(); + uint32_t compute_id = snrt_global_core_idx(); + uint32_t num_cores = snrt_cluster_compute_core_num(); + + // printf("Hello from core [%d/%d]\n", compute_id, num_cores); + // dump_softmax(compute_id); for (int32_t b = 0; b < batch_size; b++) { for (int32_t s = 0; s < seq_len; s++) { @@ -59,6 +63,7 @@ static inline void softmax_fp32(float *input, float *output, int32_t ldI, sum = 0.0; for (int32_t i = 0; i < input_samples; i++) { + // dump_input(input[b * batch_offset + s * ldI + i]); if (input[b * batch_offset + s * ldI + i] > max_core) { max_core = input[b * batch_offset + s * ldI + i]; } @@ -69,18 +74,15 @@ static inline void softmax_fp32(float *input, float *output, int32_t ldI, output[b * batch_offset + s * ldI + i] = // FIXME: Below code is erroring due to the standard math // lib conflict - // TODO: Try out with musl lib - // expf(input[b * batch_offset + s * ldI + i] - max_core); - // FIXME: actually there should be an exponentiation - input[b * batch_offset + s * ldI + i] - max_core; + expf(input[b * batch_offset + s * ldI + i] - max_core); + // input[b * batch_offset + s * ldI + i] - max_core; sum += output[b * batch_offset + s * ldI + i]; } // compute the softmax value of the current row for (int32_t i = 0; i < input_samples; i++) { - // INFO: DIVSQRT unit MUST be activated in the cluster - // configuration output[b * batch_offset + s * ldI + i] /= sum; + // dump_output(output[b * batch_offset + s * ldI + i]); // printf("output[%d] = %f\n", compute_id * input_samples + b * // batch_offset + s * ldI + i, // output[b * batch_offset + s * ldI + i]);