From 499fed21006fa4dc1aa2093b0fc745421402c0a3 Mon Sep 17 00:00:00 2001
From: Luka Macan
Date: Sun, 14 Jan 2024 19:37:51 +0100
Subject: [PATCH] Add neureka support similar to ne16

---
 inc/pulp_nnx_neureka.h | 77 ++++
 neureka/bsp/neureka_siracusa_bsp.c | 93 ++++
 neureka/bsp/neureka_siracusa_bsp.h | 81 ++++
 neureka/gvsoc/neureka_gvsoc.h | 54 +++
 .../pulp_nnx_error_codes.h => hal/neureka.c} | 27 +-
 .../{src/pulp_nnx_util.c => hal/neureka.h} | 22 +-
 neureka/hal/neureka_task.c | 234 ++++++++++
 neureka/hal/neureka_task.h | 173 ++++++++
 neureka/hal/neureka_task_defs.h | 114 +++++
 neureka/inc/pulp_nnx_defs.h | 167 -------
 neureka/inc/pulp_nnx_hal.h | 217 ---------
 neureka/inc/pulp_nnx_util.h | 27 --
 neureka/src/pulp_nnx_hal.c | 412 ------------------
 src/pulp_nnx_neureka.c | 131 ++++++
 14 files changed, 988 insertions(+), 841 deletions(-)
 create mode 100644 inc/pulp_nnx_neureka.h
 create mode 100644 neureka/bsp/neureka_siracusa_bsp.c
 create mode 100644 neureka/bsp/neureka_siracusa_bsp.h
 create mode 100644 neureka/gvsoc/neureka_gvsoc.h
 rename neureka/{inc/pulp_nnx_error_codes.h => hal/neureka.c} (53%)
 rename neureka/{src/pulp_nnx_util.c => hal/neureka.h} (62%)
 create mode 100644 neureka/hal/neureka_task.c
 create mode 100644 neureka/hal/neureka_task.h
 create mode 100644 neureka/hal/neureka_task_defs.h
 delete mode 100644 neureka/inc/pulp_nnx_defs.h
 delete mode 100644 neureka/inc/pulp_nnx_hal.h
 delete mode 100644 neureka/inc/pulp_nnx_util.h
 delete mode 100644 neureka/src/pulp_nnx_hal.c
 create mode 100644 src/pulp_nnx_neureka.c

diff --git a/inc/pulp_nnx_neureka.h b/inc/pulp_nnx_neureka.h
new file mode 100644
index 0000000..48e16f1
--- /dev/null
+++ b/inc/pulp_nnx_neureka.h
@@ -0,0 +1,77 @@
+/*
+ * Luka Macan
+ *
+ * Copyright 2023 ETH Zurich and University of Bologna
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ *
+ * SPDX-License-Identifier: Apache-2.0
+ */
+
+#include "neureka.h"
+#include "neureka_siracusa_bsp.h"
+#include "neureka_task.h"
+#include <stdint.h>
+
+/* PULP-NNX interface */
+
+void neureka_nnx_init(neureka_dev_t *dev, neureka_siracusa_conf_t *conf);
+void neureka_nnx_term(neureka_dev_t *dev);
+
+/** neureka_nnx_dispatch_check
+ *
+ * Check whether you can dispatch to the accelerator.
+ */
+int neureka_nnx_dispatch_check(neureka_dev_t *dev);
+
+/** neureka_nnx_dispatch_wait
+ *
+ * Block until you can dispatch to the accelerator.
+ */
+void neureka_nnx_dispatch_wait(neureka_dev_t *dev);
+
+/** neureka_nnx_dispatch
+ *
+ * Dispatch a task to the accelerator.
+ * Fails with return code 1 if the task cannot be dispatched. Otherwise returns 0.
+ */
+int neureka_nnx_dispatch(neureka_dev_t *dev, neureka_task_t *task);
+
+/** neureka_nnx_resolve_check
+ *
+ * Check whether the task has been resolved.
+ */
+int neureka_nnx_resolve_check(neureka_dev_t *dev, neureka_task_t *task);
+
+/** neureka_nnx_resolve_wait
+ *
+ * Block until you can resolve the task.
+ */
+void neureka_nnx_resolve_wait(neureka_dev_t *dev, neureka_task_t *task);
+
+
+/* Additional helper functions */
+
+/** neureka_nnx_dispatch_stride2x2
+ *
+ * It uses Neureka's 2x2 strided mode which reduces the number of writes Neureka does.
+ * This mode doesn't stride the Neureka's subtile input pointer, so we have to
+ * tile the tile to the subtile's spatial dimensions (in this case 3x3 output).
+ * Works only if the k_out is divisible by 2.
+ */
+void neureka_nnx_dispatch_stride2x2(
+    neureka_dev_t *dev, neureka_task_t *task, const uint32_t w_in, const uint32_t k_in,
+    const uint32_t w_in_stride, const uint32_t k_in_stride,
+    const uint32_t h_out, const uint32_t w_out, const uint32_t k_out,
+    const uint32_t w_out_stride, const uint32_t k_out_stride,
+    const uint8_t h_ker, const uint8_t w_ker);
diff --git a/neureka/bsp/neureka_siracusa_bsp.c b/neureka/bsp/neureka_siracusa_bsp.c
new file mode 100644
index 0000000..28deda0
--- /dev/null
+++ b/neureka/bsp/neureka_siracusa_bsp.c
@@ -0,0 +1,93 @@
+/*
+ * Luka Macan
+ *
+ * Copyright 2023 ETH Zurich and University of Bologna
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ *
+ * SPDX-License-Identifier: Apache-2.0
+ */
+
+#include "neureka_siracusa_bsp.h"
+#include <pmsis.h> // NOTE(review): header name lost in extraction; pmsis.h assumed - confirm
+
+#define NEUREKA_SIRACUSA_CLUSTER_CTRL_BASE_ADDR (0x00200000)
+#define NEUREKA_SIRACUSA_CLUSTER_CTRL_HWPE_OFFS 0x18
+#define NEUREKA_SIRACUSA_CLUSTER_CTRL_HWPE_ADDR \
+  (NEUREKA_SIRACUSA_CLUSTER_CTRL_BASE_ADDR + NEUREKA_SIRACUSA_CLUSTER_CTRL_HWPE_OFFS)
+#define NEUREKA_SIRACUSA_CLUSTER_CTRL_HWPE_MASK_CG_EN 0x800
+#define NEUREKA_SIRACUSA_CLUSTER_CTRL_HWPE_MASK_HCI_PRIO 0x100
+#define NEUREKA_SIRACUSA_CLUSTER_CTRL_HWPE_MASK_HCI_MAXSTALL 0xff
+#define NEUREKA_SIRACUSA_MAX_STALL (8)
+#define NEUREKA_SIRACUSA_EVENT_0 (1 << 12)
+#define NEUREKA_SIRACUSA_EVENT_1 (1 << 13)
+#define NEUREKA_SIRACUSA_BASE_ADDR (0x00201000)
+#define NEUREKA_SIRACUSA_WEIGHT_MEM_BASE_ADDR (0x10400000)
+#define NEUREKA_SIRACUSA_WEIGHT_MEM_MRAM_OFFSET (0x00000000)
+#define NEUREKA_SIRACUSA_WEIGHT_MEM_SRAM_OFFSET (0x00400000)
+
+void neureka_siracusa_cg_enable() {
+  *(volatile uint32_t *)NEUREKA_SIRACUSA_CLUSTER_CTRL_HWPE_ADDR |=
+      NEUREKA_SIRACUSA_CLUSTER_CTRL_HWPE_MASK_CG_EN;
+}
+
+void neureka_siracusa_cg_disable() {
+  *(volatile uint32_t *)NEUREKA_SIRACUSA_CLUSTER_CTRL_HWPE_ADDR &=
+      ~NEUREKA_SIRACUSA_CLUSTER_CTRL_HWPE_MASK_CG_EN;
+}
+
+// TODO: Check if needed for neureka
+void neureka_siracusa_hci_setpriority_neureka() {
+  *(volatile uint32_t *)NEUREKA_SIRACUSA_CLUSTER_CTRL_HWPE_ADDR |=
+      NEUREKA_SIRACUSA_CLUSTER_CTRL_HWPE_MASK_HCI_PRIO;
+}
+
+// TODO: Check if needed for neureka
+void neureka_siracusa_hci_setpriority_core() {
+  *(volatile uint32_t *)NEUREKA_SIRACUSA_CLUSTER_CTRL_HWPE_ADDR &=
+      ~NEUREKA_SIRACUSA_CLUSTER_CTRL_HWPE_MASK_HCI_PRIO;
+}
+
+// TODO: Check if needed for neureka
+void neureka_siracusa_hci_reset_max_stall() {
+  *(volatile uint32_t *)NEUREKA_SIRACUSA_CLUSTER_CTRL_HWPE_ADDR &=
+      ~NEUREKA_SIRACUSA_CLUSTER_CTRL_HWPE_MASK_HCI_MAXSTALL;
+}
+
+// TODO: Check if needed for neureka
+void neureka_siracusa_hci_set_max_stall(uint32_t max_stall) {
+  *(volatile uint32_t *)NEUREKA_SIRACUSA_CLUSTER_CTRL_HWPE_ADDR |=
+      max_stall & NEUREKA_SIRACUSA_CLUSTER_CTRL_HWPE_MASK_HCI_MAXSTALL;
+}
+
+void neureka_siracusa_open(neureka_siracusa_conf_t *conf) {
+  neureka_siracusa_cg_enable();
+  neureka_siracusa_hci_setpriority_neureka();
+  neureka_siracusa_hci_set_max_stall(conf->max_stall);
+}
+
+void neureka_siracusa_close() {
+  neureka_siracusa_hci_reset_max_stall();
+  neureka_siracusa_hci_setpriority_core();
+  neureka_siracusa_cg_disable();
+}
+
+void neureka_siracusa_event_wait_and_clear() {
+  eu_evt_maskWaitAndClr(NEUREKA_SIRACUSA_EVENT_0);
+}
+
+static const neureka_dev_t neureka_siracusa_dev = {
+    .hwpe_dev = (struct hwpe_dev_t){
+        .base_addr = (volatile uint32_t *)NEUREKA_SIRACUSA_BASE_ADDR}};
+
+const neureka_dev_t *neureka_siracusa_get_dev() { return &neureka_siracusa_dev; }
diff --git a/neureka/bsp/neureka_siracusa_bsp.h b/neureka/bsp/neureka_siracusa_bsp.h
new file mode 100644
index 0000000..9e879e8
--- /dev/null
+++ b/neureka/bsp/neureka_siracusa_bsp.h
@@ -0,0 +1,81 @@
+/*
+ * Luka Macan
+ *
+ * Copyright 2023 ETH Zurich and University of Bologna
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ *
+ * SPDX-License-Identifier: Apache-2.0
+ */
+
+#ifndef __NEUREKA_siracusa_BSP_H__
+#define __NEUREKA_siracusa_BSP_H__
+
+#include "neureka.h"
+#include <stdint.h>
+
+/**
+ * neureka_siracusa_cg_enable
+ *
+ * Enable clock gating of the neureka.
+ */
+void neureka_siracusa_cg_enable();
+
+/**
+ * neureka_siracusa_cg_disable
+ *
+ * Disable clock gating of the neureka.
+ */
+void neureka_siracusa_cg_disable();
+
+/**
+ * neureka_siracusa_hci_setpriority_neureka
+ *
+ * Set HCI interconnect bus priority to prioritize neureka.
+ */
+void neureka_siracusa_hci_setpriority_neureka();
+
+/**
+ * neureka_siracusa_hci_setpriority_core
+ *
+ * Set HCI bus priority to prioritize cores.
+ */
+void neureka_siracusa_hci_setpriority_core();
+
+/**
+ * neureka_siracusa_hci_reset_max_stall
+ *
+ * Reset the HCI bus maxstall parameter.
+ * TODO: Check if it disables it also or just resets?
+ */
+void neureka_siracusa_hci_reset_max_stall();
+
+/**
+ * neureka_siracusa_hci_set_max_stall
+ *
+ * Set the HCI bus maxstall. Maxstall defines how many cycles
+ * the HCI bus will stall the lower priority master, i.e. neureka or core,
+ * before letting it do a transaction.
+ */
+void neureka_siracusa_hci_set_max_stall(uint32_t max_stall);
+
+typedef struct neureka_siracusa_conf_t {
+  int max_stall;
+} neureka_siracusa_conf_t;
+
+void neureka_siracusa_open(neureka_siracusa_conf_t *conf);
+void neureka_siracusa_close();
+void neureka_siracusa_event_wait_and_clear();
+const neureka_dev_t *neureka_siracusa_get_dev();
+
+#endif // !__NEUREKA_siracusa_BSP_H__
diff --git a/neureka/gvsoc/neureka_gvsoc.h b/neureka/gvsoc/neureka_gvsoc.h
new file mode 100644
index 0000000..37eeab0
--- /dev/null
+++ b/neureka/gvsoc/neureka_gvsoc.h
@@ -0,0 +1,54 @@
+/*
+ * Luka Macan
+ *
+ * Copyright 2023 ETH Zurich and University of Bologna
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ *
+ * SPDX-License-Identifier: Apache-2.0
+ */
+
+#ifndef __NEUREKA_GVSOC_H__
+#define __NEUREKA_GVSOC_H__
+
+#include "neureka.h"
+#include "neureka_task.h"
+
+#define NEUREKA_REG_GVSOC_LOG_LEVEL 24
+#define NEUREKA_REG_GVSOC_LOG_FORMAT 25
+
+typedef enum neureka_gvsoc_log_format_e {
+  NEUREKA_GVSOC_LOG_FORMAT_DECIMAL = 0,
+  NEUREKA_GVSOC_LOG_FORMAT_HEXADECIMAL = 3
+} neureka_gvsoc_log_format_e;
+
+typedef enum neureka_gvsoc_log_level_e {
+  NEUREKA_GVSOC_LOG_LEVEL_JOB_START_END = 0,
+  NEUREKA_GVSOC_LOG_LEVEL_CONFIG = 1,
+  NEUREKA_GVSOC_LOG_LEVEL_ACTIV_INOUT = 2,
+  NEUREKA_GVSOC_LOG_LEVEL_ALL = 3
+} neureka_gvsoc_log_level_e;
+
+static inline void neureka_gvsoc_log_activate(neureka_dev_t *dev,
+                                              neureka_gvsoc_log_level_e log_level,
+                                              neureka_gvsoc_log_format_e format) {
+  hwpe_task_reg_write(&dev->hwpe_dev, NEUREKA_REG_GVSOC_LOG_LEVEL, log_level);
+  hwpe_task_reg_write(&dev->hwpe_dev, NEUREKA_REG_GVSOC_LOG_FORMAT, format);
+}
+
+static inline void neureka_gvsoc_log_deactivate(neureka_dev_t *dev) {
+  hwpe_task_reg_write(&dev->hwpe_dev, NEUREKA_REG_GVSOC_LOG_LEVEL,
+                      NEUREKA_GVSOC_LOG_LEVEL_JOB_START_END);
+}
+
+#endif // __NEUREKA_GVSOC_H__
diff --git a/neureka/inc/pulp_nnx_error_codes.h b/neureka/hal/neureka.c
similarity index 53%
rename from neureka/inc/pulp_nnx_error_codes.h
rename to neureka/hal/neureka.c
index dc71575..ebcad93 100644
--- a/neureka/inc/pulp_nnx_error_codes.h
+++ b/neureka/hal/neureka.c
@@ -18,15 +18,22 @@
  * SPDX-License-Identifier: Apache-2.0
  */
 
-#ifndef __NE16_ERROR_CODES_H__
-#define __NE16_ERROR_CODES_H__
+#include "neureka.h"
 
-typedef enum {
-  success = 0,
-  weightBitwidthOutOfBounds,
-  unsupportedWeightOffsetMode,
-  unsupportedFeatureBitwidth,
-  dimensionMismatch
-} nnx_error_code;
+#define NEUREKA_STATUS_EMPTY (0x000)
+#define NEUREKA_STATUS_FULL (0x101)
 
-#endif // __NE16_ERROR_CODES_H__
\ No newline at end of file
+inline int neureka_task_queue_size(neureka_dev_t *dev) { return 2; }
+
+inline int neureka_task_queue_tasks_in_flight(neureka_dev_t *dev) {
+  uint32_t status = hwpe_task_queue_status(&dev->hwpe_dev);
+  return (status & 0x1) + ((status >> 8) & 0x1);
+}
+
+inline int neureka_task_queue_empty(neureka_dev_t *dev) {
+  return hwpe_task_queue_status(&dev->hwpe_dev) == NEUREKA_STATUS_EMPTY;
+}
+
+inline int neureka_task_queue_full(neureka_dev_t *dev) {
+  return hwpe_task_queue_status(&dev->hwpe_dev) == NEUREKA_STATUS_FULL;
+}
diff --git a/neureka/src/pulp_nnx_util.c b/neureka/hal/neureka.h
similarity index 62%
rename from neureka/src/pulp_nnx_util.c
rename to neureka/hal/neureka.h
index daaaf2b..887d995 100644
--- a/neureka/src/pulp_nnx_util.c
+++ b/neureka/hal/neureka.h
@@ -18,13 +18,19 @@
  * SPDX-License-Identifier: Apache-2.0
  */
 
-#include "pulp_nnx_util.h"
-#include "pulp_nnx_hal.h"
+#ifndef __NEUREKA_H__
+#define __NEUREKA_H__
 
-void nnx_activate_gvsoc_logging(int log_level) {
-  NEUREKA_WRITE_IO_REG(NEUREKA_REG_GVSOC_TRACE, log_level);
-}
+#include "hwpe.h"
+#include <stdint.h>
 
-void nnx_deactivate_gvsoc_logging() {
-  NEUREKA_WRITE_IO_REG(NEUREKA_REG_GVSOC_TRACE, 0);
-}
+typedef struct neureka_dev_t {
+  hwpe_dev_t hwpe_dev; /* Implements the HWPE device interface */
+} neureka_dev_t;
+
+int neureka_task_queue_size(neureka_dev_t *dev);
+int neureka_task_queue_tasks_in_flight(neureka_dev_t *dev);
+int neureka_task_queue_empty(neureka_dev_t *dev);
+int neureka_task_queue_full(neureka_dev_t *dev);
+
+#endif // __NEUREKA_H__
diff --git a/neureka/hal/neureka_task.c b/neureka/hal/neureka_task.c
new file mode 100644
index 0000000..943c373
--- /dev/null
+++ b/neureka/hal/neureka_task.c
@@ -0,0 +1,234 @@
+/*
+ * Luka Macan
+ *
+ * Copyright 2023 ETH Zurich and University of Bologna
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ *
+ * SPDX-License-Identifier: Apache-2.0
+ */
+
+#include "neureka_task.h"
+#include "neureka_task_defs.h"
+#include "pulp_nnx_util.h"
+
+inline uint32_t neureka_get_tile_padding(uint32_t padding, uint32_t i_height,
+                                         uint32_t i_width, uint32_t n_height,
+                                         uint32_t n_width) {
+  uint32_t tile_padding = padding;
+  if (i_height > 0) {
+    tile_padding &= ~(0xfu << 28);
+  }
+  if (i_width < n_width - 1) {
+    tile_padding &= ~(0xfu << 24);
+  }
+  if (i_height < n_height - 1) {
+    tile_padding &= ~(0xfu << 20);
+  }
+  if (i_width > 0) {
+    tile_padding &= ~(0xfu << 16);
+  }
+  return tile_padding;
+}
+
+void neureka_task_init(neureka_task_t *task, const uint8_t kernel_shape,
+                       const uint8_t depthwise, const uint8_t input_bits,
+                       const uint8_t output_bits, const uint8_t weights_bits,
+                       const neureka_weight_offset_mode_e weights_offset_mode,
+                       const uint32_t weights_offset_factor, neureka_quant_t quant,
+                       neureka_norm_t norm, const uint8_t stride) {
+  const uint32_t flag_mode16 =
+      input_bits == 16 ? NEUREKA_FLAG_MODE16 : NEUREKA_FLAG_MODE_BASIC;
+
+  *task = (neureka_task_t){
+      .outbytes = output_bits / 8,
+      .weight_d0_stride = flag_mode16 ? NEUREKA_WEIGHT_D0_STRIDE_MODE16
+                          : kernel_shape == 3 ?
+                          NEUREKA_WEIGHT_D0_STRIDE_MODE8_3x3 :
+                          NEUREKA_WEIGHT_D0_STRIDE_MODE8,
+      .qw = weights_bits,
+      .stride_shift = stride == 2 ? 1 : 0,
+      .output_channel_throughput = depthwise ? NEUREKA_INPUT_CHANNEL_THROUGHPUT_3x3
+                                             : NEUREKA_OUTPUT_CHANNEL_THROUGHPUT,
+      .input_channel_throughput = kernel_shape == 3 ? NEUREKA_INPUT_CHANNEL_THROUGHPUT_3x3
+                                                    : NEUREKA_INPUT_CHANNEL_THROUGHPUT,
+      .kernel_shape = kernel_shape,
+      .depthwise = depthwise,
+      .data = {0}};
+
+  const int flag_stride2x2 = stride == 2 ? NEUREKA_FLAG_STRIDE_2x2 : 0;
+
+  const int flag_mode = kernel_shape == 1 ? NEUREKA_FLAG_MODE_1x1
+                        : depthwise == 1  ? NEUREKA_FLAG_MODE_3x3_DW
+                                          : NEUREKA_FLAG_MODE_3x3;
+
+  task->data.cfg.conf0 |=
+      NEUREKA_FLAG_NORM_QUANT | quant.function | quant.mode |
+      (quant.shift_amount << 16) | quant.flag_rounding << NEUREKA_SHIFT_ROUNDING |
+      norm.mode | norm.flag_bias << NEUREKA_SHIFT_FLAG_NORM_BIAS |
+      norm.flag_shift << NEUREKA_SHIFT_FLAG_NORM_SHIFT | weights_offset_mode |
+      flag_mode | flag_mode16 | (weights_bits - 1) | flag_stride2x2;
+
+  task->data.cfg.weight_offset_factor = weights_offset_factor;
+}
+
+/** neureka_pad_ptr
+ *
+ * Move ptr back by padding_top rows and padding_left pixels so that it
+ * points to where the padded data would start.
+ * Necessary for input pointer when it's padded.
+ */
+inline uint32_t neureka_pad_ptr(uint32_t ptr, const uint32_t width,
+                                const uint32_t channel, const uint8_t bits,
+                                const uint8_t padding_top,
+                                const uint8_t padding_left) {
+  return ptr - (padding_top * width + padding_left) * channel * bits / 8;
+}
+
+inline void neureka_task_set_ptrs(neureka_task_t *task, uint32_t input_ptr,
+                                  uint32_t w_in, uint32_t k_in, uint8_t bits_in,
+                                  uint8_t padding_top, uint8_t padding_left,
+                                  uint32_t output_ptr, uint32_t weights_ptr,
+                                  uint32_t scale_ptr, uint32_t shift_ptr,
+                                  uint32_t bias_ptr) {
+  task->data.infeat_ptr =
+      neureka_pad_ptr(input_ptr, w_in, k_in, bits_in, padding_top, padding_left);
+  task->data.outfeat_ptr = output_ptr;
+  task->data.weights_ptr = weights_ptr;
+  task->data.scale_ptr = scale_ptr;
+  task->data.scale_shift_ptr = shift_ptr;
+  task->data.scale_bias_ptr = bias_ptr;
+}
+
+void neureka_task_set_strides(neureka_task_t *task, const uint32_t k_in,
+                              const uint32_t w_in_stride,
+                              const uint32_t k_in_stride,
+                              const uint32_t w_out_stride,
+                              const uint32_t k_out_stride) {
+  const uint32_t num_k_in = divnceil(k_in, NEUREKA_INPUT_CHANNEL_THROUGHPUT);
+
+  const neureka_stride_t input_stride = {
+      .d0 = k_in_stride,
+      .d1 = k_in_stride * w_in_stride,
+      .d2 = task->depthwise ? 0 :
+            task->kernel_shape == 1 ? k_in_stride * 3 * 3 : // TODO: Check this magic
+            k_in_stride * NEUREKA_FILTER_BUFFER_SIZE * NEUREKA_FILTER_BUFFER_SIZE};
+  task->data.cfg.input_stride = input_stride;
+
+  // WARNING: Stride works only for even output channel sizes (divisible by 2)
+  const neureka_stride_t output_stride = {
+      .d0 = 32,
+      .d1 = (k_out_stride * task->outbytes) >> task->stride_shift,
+      .d2 =
+          (k_out_stride * task->outbytes * w_out_stride) >> task->stride_shift};
+  task->data.cfg.output_stride = output_stride;
+
+  if (task->kernel_shape == 1) {
+    task->data.cfg.weights_stride.d0 = task->weight_d0_stride * task->qw;
+    task->data.cfg.weights_stride.d1 =
+        task->weight_d0_stride * task->qw * num_k_in;
+    task->data.cfg.weights_stride.d2 = 0;
+  } else if (!task->depthwise) {
+    task->data.cfg.weights_stride.d0 = task->weight_d0_stride;
+    task->data.cfg.weights_stride.d1 = task->weight_d0_stride * task->qw *
+                                       num_k_in;
+    task->data.cfg.weights_stride.d2 = 0;
+  } else {
+    task->data.cfg.weights_stride.d0 =
+        NEUREKA_FILTER_SIZE * NEUREKA_FILTER_SIZE * task->weight_d0_stride;
+    task->data.cfg.weights_stride.d1 = 0;
+    task->data.cfg.weights_stride.d2 = 0;
+  }
+}
+
+void neureka_task_set_counters(neureka_task_t *task, const uint32_t k_in,
+                               const uint32_t h_out, const uint32_t w_out,
+                               const uint32_t k_out, const uint8_t padding_bottom,
+                               const uint8_t padding_right) {
+  const uint16_t num_Ko = divnceil(k_out, task->output_channel_throughput);
+  const uint16_t num_Ki = divnceil(k_in, task->input_channel_throughput);
+  const uint16_t num_Ho = divnceil(h_out, NEUREKA_FILTER_SIZE);
+  const uint16_t num_Wo = divnceil(w_out, NEUREKA_FILTER_SIZE);
+
+  const uint16_t rem_Ko = remainder(k_out, task->output_channel_throughput);
+  const uint16_t rem_Ki = remainder(k_in, task->input_channel_throughput);
+  const uint16_t rem_Ho = remainder(h_out, NEUREKA_FILTER_SIZE);
+  const uint16_t rem_Wo = remainder(w_out, NEUREKA_FILTER_SIZE);
+  const uint16_t rem_Hi =
+      (task->kernel_shape == 1 ? rem_Ho : rem_Ho + 2) - padding_bottom; // TODO: Check padding bottom
+  const uint16_t rem_Wi =
+      (task->kernel_shape == 1 ? rem_Wo : rem_Wo + 2) - padding_right; // TODO: Check padding right
+
+  const neureka_subtile_t subtile = {
+      .number = {.KoKi = concat_half(num_Ko, num_Ki),
+                 .HoWo = concat_half(num_Ho, num_Wo)},
+      .remainder = {.KoKi = concat_half(rem_Ko, rem_Ki),
+                    .HoWo = concat_half(rem_Ho, rem_Wo),
+                    .HiWi = concat_half(rem_Hi, rem_Wi)}};
+  task->data.cfg.subtile = subtile;
+}
+
+inline void neureka_task_set_padding(neureka_task_t *task, const uint8_t top,
+                                     const uint8_t bottom, const uint8_t left,
+                                     const uint8_t right, const uint8_t value) {
+  task->data.cfg.padding = ((top & 0xfu) << 28) | ((right & 0xfu) << 24) |
+                           ((bottom & 0xfu) << 20) | ((left & 0xfu) << 16) |
+                           (value & 0xffu);
+}
+
+inline void neureka_task_set_mask_filter(neureka_task_t *task, const uint8_t top,
+                                         const uint8_t right, const uint8_t bottom,
+                                         const uint8_t left) {
+  task->data.cfg.filter_mask = ((top & 0xffu) << 24) | ((right & 0xffu) << 16) |
+                               ((bottom & 0xffu) << 8) | ((left & 0xffu) << 0);
+}
+
+void neureka_task_set_dims(neureka_task_t *task, const uint32_t w_in,
+                           const uint32_t k_in, const uint32_t w_in_stride,
+                           const uint32_t k_in_stride, const uint32_t h_out,
+                           const uint32_t w_out, const uint32_t k_out,
+                           const uint32_t w_out_stride, const uint32_t k_out_stride,
+                           const uint8_t padding_top, const uint8_t padding_bottom,
+                           const uint8_t padding_right,
+                           const uint8_t padding_left) {
+  neureka_task_set_strides(task, k_in, w_in_stride, k_in_stride, w_out_stride,
+                           k_out_stride);
+  neureka_task_set_counters(task, k_in, h_out, w_out, k_out, padding_bottom,
+                            padding_right);
+  neureka_task_set_padding(task, padding_top, padding_bottom, padding_left,
+                           padding_right, 0);
+}
+
+void neureka_task_set_dims_stride2x2(
+    neureka_task_t *task, const uint32_t h_in, const uint32_t w_in,
+    const uint32_t k_in, const uint32_t w_in_stride, const uint32_t k_in_stride,
+    const uint32_t h_out, const uint32_t w_out, const uint32_t k_out,
+    const uint32_t w_out_stride, const uint32_t k_out_stride,
+    const uint8_t h_ker, const uint8_t w_ker, const uint8_t padding_top,
+    const uint8_t padding_bottom, const uint8_t padding_right,
+    const uint8_t padding_left) {
+  const uint8_t stride = 2;
+
+  neureka_task_set_strides(task, k_in, w_in_stride, k_in_stride, w_out_stride,
+                           k_out_stride);
+  neureka_task_set_counters(task, k_in, h_out > 1 ? 3 : 1, w_out > 1 ? 3 : 1,
+                            k_out, h_in + padding_top >= 5 ? 0 : padding_bottom, 0);
+
+  const uint8_t padding_bottom_new =
+      (h_in + padding_top - h_ker) % stride == 0 ? 0 : padding_bottom;
+  const uint8_t padding_right_new =
+      (w_in + padding_left - w_ker) % stride == 0 ? 0 : padding_right;
+
+  neureka_task_set_padding(task, padding_top, padding_bottom_new, padding_left,
+                           padding_right_new, 0);
+}
diff --git a/neureka/hal/neureka_task.h b/neureka/hal/neureka_task.h
new file mode 100644
index 0000000..7f4c31b
--- /dev/null
+++ b/neureka/hal/neureka_task.h
@@ -0,0 +1,173 @@
+/*
+ * Luka Macan
+ *
+ * Copyright 2023 ETH Zurich and University of Bologna
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ *
+ * SPDX-License-Identifier: Apache-2.0
+ */
+
+#ifndef __NEUREKA_TASK_H__
+#define __NEUREKA_TASK_H__
+
+#include "neureka_task_defs.h"
+#include <stdint.h>
+
+typedef enum neureka_task_flag_e {
+  neurekaTaskFlagFalse = 0,
+  neurekaTaskFlagTrue = 1
+} neureka_task_flag_e;
+
+typedef enum neureka_weight_offset_mode_e {
+  weightOffsetModeSymmetric = NEUREKA_FLAG_WEIGHT_OFFSET_SYMMETRIC,
+  weightOffsetModeLayerWise = NEUREKA_FLAG_WEIGHT_OFFSET_LAYER_WISE
+} neureka_weight_offset_mode_e;
+
+typedef enum neureka_norm_mode_e {
+  normMode8Bit = NEUREKA_NORM_MODE_8BIT,
+  normMode16Bit = NEUREKA_NORM_MODE_16BIT,
+  normMode32Bit = NEUREKA_NORM_MODE_32BIT
+} neureka_norm_mode_e;
+
+typedef struct neureka_norm_t {
+  neureka_norm_mode_e mode;
+  int flag_bias;
+  int flag_shift;
+} neureka_norm_t;
+
+typedef enum neureka_quant_mode_e {
+  quantMode8Bit = NEUREKA_QUANT_MODE_8BIT,
+  quantMode16Bit = NEUREKA_QUANT_MODE_16BIT,
+  quantMode32Bit = NEUREKA_QUANT_MODE_32BIT
+} neureka_quant_mode_e;
+
+typedef enum neureka_quant_function_e {
+  quantFunctionIdentity = NEUREKA_FLAG_QUANT_FUNCTION_IDENTITY,
+  quantFunctionRelu = NEUREKA_FLAG_QUANT_FUNCTION_RELU
+} neureka_quant_function_e;
+
+typedef struct neureka_quant_t {
+  // Shift amount must be in range 0x00-0x1F
+  unsigned shift_amount;
+  neureka_quant_mode_e mode;
+  neureka_quant_function_e function;
+  int flag_rounding;
+} neureka_quant_t;
+
+typedef struct neureka_stride_t {
+  uint32_t d0;
+  uint32_t d1;
+  uint32_t d2;
+} neureka_stride_t;
+
+typedef struct neureka_subtile_remainder_t {
+  uint32_t KoKi;
+  uint32_t HoWo;
+  uint32_t HiWi;
+} neureka_subtile_remainder_t;
+
+typedef struct neureka_subtile_number_t {
+  uint32_t KoKi;
+  uint32_t HoWo;
+} neureka_subtile_number_t;
+
+typedef struct neureka_subtile_t {
+  neureka_subtile_remainder_t remainder;
+  neureka_subtile_number_t number;
+} neureka_subtile_t;
+
+typedef struct neureka_cfg_t {
+  neureka_stride_t input_stride;
+  neureka_stride_t output_stride;
+  neureka_stride_t weights_stride;
+  neureka_subtile_t subtile;
+  uint32_t padding;
+  uint32_t weight_offset_factor;
+  uint32_t filter_mask;
+  uint32_t conf0;
+} neureka_cfg_t;
+
+typedef struct neureka_task_data_t {
+  uint32_t weights_ptr;
+  uint32_t infeat_ptr;
+  uint32_t outfeat_ptr;
+  uint32_t scale_ptr;
+  uint32_t scale_shift_ptr;
+  uint32_t scale_bias_ptr;
+  neureka_cfg_t cfg;
+} neureka_task_data_t;
+
+typedef struct neureka_task_t {
+  neureka_task_data_t data;
+  uint8_t outbytes;
+  uint8_t weight_d0_stride;
+  uint8_t qw;
+  uint8_t stride_shift;
+  uint8_t output_channel_throughput;
+  uint8_t input_channel_throughput;
+  uint8_t kernel_shape;
+  uint8_t depthwise;
+  uint8_t id;
+} neureka_task_t;
+
+void neureka_task_init(neureka_task_t *task, const uint8_t kernel_shape,
+                       const uint8_t depthwise, const uint8_t input_bits,
+                       const uint8_t output_bits, const uint8_t weights_bits,
+                       const neureka_weight_offset_mode_e weights_offset_mode,
+                       const uint32_t weights_offset_factor, neureka_quant_t quant,
+                       neureka_norm_t norm, const uint8_t stride);
+uint32_t neureka_get_tile_padding(uint32_t padding, uint32_t i_height,
+                                  uint32_t i_width, uint32_t n_height,
+                                  uint32_t n_width);
+uint32_t neureka_pad_ptr(uint32_t ptr, const uint32_t width,
+                         const uint32_t channel, const uint8_t bits,
+                         const uint8_t padding_top, const uint8_t padding_left);
+void neureka_task_set_ptrs(neureka_task_t *task, uint32_t input_ptr, uint32_t w_in,
+                           uint32_t k_in, uint8_t bits_in, uint8_t padding_top,
+                           uint8_t padding_left, uint32_t output_ptr,
+                           uint32_t weights_ptr, uint32_t scale_ptr,
+                           uint32_t shift_ptr, uint32_t bias_ptr);
+void neureka_task_set_strides(neureka_task_t *task, const uint32_t k_in,
+                              const uint32_t w_in_stride,
+                              const uint32_t k_in_stride,
+                              const uint32_t w_out_stride,
+                              const uint32_t k_out_stride);
+void neureka_task_set_counters(neureka_task_t *task, const uint32_t k_in,
+                               const uint32_t h_out, const uint32_t w_out,
+                               const uint32_t k_out, const uint8_t padding_bottom,
+                               const uint8_t padding_right);
+void neureka_task_set_padding(neureka_task_t *task, const uint8_t top,
+                              const uint8_t bottom, const uint8_t left,
+                              const uint8_t right, const uint8_t value);
+void neureka_task_set_mask_filter(neureka_task_t *task, const uint8_t top,
+                                  const uint8_t right, const uint8_t bottom,
+                                  const uint8_t left);
+void neureka_task_set_dims(neureka_task_t *task, const uint32_t w_in,
+                           const uint32_t k_in, const uint32_t w_in_stride,
+                           const uint32_t k_in_stride, const uint32_t h_out,
+                           const uint32_t w_out, const uint32_t k_out,
+                           const uint32_t w_out_stride, const uint32_t k_out_stride,
+                           const uint8_t padding_top, const uint8_t padding_bottom,
+                           const uint8_t padding_right,
+                           const uint8_t padding_left);
+void neureka_task_set_dims_stride2x2(
+    neureka_task_t *task, const uint32_t h_in, const uint32_t w_in,
+    const uint32_t k_in, const uint32_t w_in_stride, const uint32_t k_in_stride,
+    const uint32_t h_out, const uint32_t w_out, const uint32_t k_out,
+    const uint32_t w_out_stride, const uint32_t k_out_stride,
+    const uint8_t h_ker, const uint8_t w_ker, const uint8_t padding_top,
+    const uint8_t padding_bottom, const uint8_t padding_right,
+    const uint8_t padding_left);
+
+#endif // !__NEUREKA_TASK_H__
diff --git a/neureka/hal/neureka_task_defs.h b/neureka/hal/neureka_task_defs.h
new file mode 100644
index 0000000..daa9897
--- /dev/null
+++ b/neureka/hal/neureka_task_defs.h
@@ -0,0 +1,114 @@
+/*
+ * Luka Macan
+ *
+ * Copyright 2023 ETH Zurich and University of Bologna
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ *
+ * SPDX-License-Identifier: Apache-2.0
+ */
+
+#ifndef __NEUREKA_DEFS_H__
+#define __NEUREKA_DEFS_H__
+
+/* ARCHITECTURE */
+
+#define NEUREKA_FILTER_SIZE (6)
+#define NEUREKA_FILTER_BUFFER_SIZE (8)
+#define NEUREKA_INPUT_CHANNEL_THROUGHPUT (32)
+#define NEUREKA_INPUT_CHANNEL_THROUGHPUT_3x3 (28)
+#define NEUREKA_OUTPUT_CHANNEL_THROUGHPUT (32)
+#define NEUREKA_WEIGHT_BANDWIDTH (256)
+
+#define NEUREKA_WEIGHT_D0_STRIDE_MODE8 (NEUREKA_INPUT_CHANNEL_THROUGHPUT / 8)
+#define NEUREKA_WEIGHT_D0_STRIDE_MODE8_3x3 (NEUREKA_WEIGHT_BANDWIDTH / 8)
+#define NEUREKA_WEIGHT_D0_STRIDE_MODE16 (NEUREKA_INPUT_CHANNEL_THROUGHPUT / 16)
+
+/* TASK REGISTERS */
+
+// job configuration
+#define NEUREKA_REG_WEIGHTS_PTR 0
+#define NEUREKA_REG_INFEAT_PTR 1
+#define NEUREKA_REG_OUTFEAT_PTR 2
+#define NEUREKA_REG_SCALE_PTR 3
+#define NEUREKA_REG_SCALE_SHIFT_PTR 4
+#define NEUREKA_REG_SCALE_BIAS_PTR 5
+#define NEUREKA_REG_INFEAT_D0_STRIDE 6
+#define NEUREKA_REG_INFEAT_D1_STRIDE 7
+#define NEUREKA_REG_INFEAT_D2_STRIDE 8
+#define NEUREKA_REG_OUTFEAT_D0_STRIDE 9
+#define NEUREKA_REG_OUTFEAT_D1_STRIDE 10
+#define NEUREKA_REG_OUTFEAT_D2_STRIDE 11
+#define NEUREKA_REG_WEIGHTS_D0_STRIDE 12
+#define NEUREKA_REG_WEIGHTS_D1_STRIDE 13
+#define NEUREKA_REG_WEIGHTS_D2_STRIDE 14
+#define NEUREKA_REG_SUBTILE_REMAINDER_0 15
+#define NEUREKA_REG_SUBTILE_REMAINDER_1 16
+#define NEUREKA_REG_SUBTILE_REMAINDER_2 17
+#define NEUREKA_REG_SUBTILE_NUMBER_0 18
+#define NEUREKA_REG_SUBTILE_NUMBER_1 19
+#define NEUREKA_REG_PADDING 20
+#define NEUREKA_REG_WEIGHT_OFFSET_FACTOR 21
+#define NEUREKA_REG_FILTER_MASKING 22
+#define NEUREKA_REG_CONF0 23
+
+/* SHIFT */
+
+#define NEUREKA_SHIFT_FLAG_NORM_BIAS (25)
+#define NEUREKA_SHIFT_FLAG_NORM_SHIFT (24)
+#define NEUREKA_SHIFT_QUANT_SHIFT (16)
+#define NEUREKA_SHIFT_ROUNDING (11)
+
+/* CONF0 FLAGS */
+
+#define NEUREKA_FLAG_NORM_BIAS (1 << 25)
+#define NEUREKA_FLAG_NORM_SHIFT (1 << 24)
+#define NEUREKA_FLAG_QUANT_FUNCTION_IDENTITY (1 << 23)
+#define NEUREKA_FLAG_QUANT_FUNCTION_RELU (0 << 23)
+#define NEUREKA_QUANT_MODE_8BIT (0 << 21)
+#define NEUREKA_QUANT_MODE_16BIT (1 << 21)
+#define NEUREKA_QUANT_MODE_32BIT (2 << 21)
+// conf0[20:16] - quantization shift amount
+#define NEUREKA_FLAG_WEIGHT_OFFSET_SYMMETRIC (0 << 15)
+#define NEUREKA_FLAG_WEIGHT_OFFSET_LAYER_WISE (1 << 15)
+#define NEUREKA_FLAG_STREAMIN (1 << 14)
+#define NEUREKA_NORM_MODE_8BIT (0 << 12)
+#define NEUREKA_NORM_MODE_16BIT (1 << 12)
+#define NEUREKA_NORM_MODE_32BIT (2 << 12)
+#define NEUREKA_FLAG_ROUND (1 << 11)
+#define NEUREKA_FLAG_ACTIVATION_PREFETCH (1 << 10)
+#define NEUREKA_FLAG_USE_WMEM (1 << 9)
+#define NEUREKA_FLAG_USE_TCDM (0 << 9)
+#define NEUREKA_FLAG_STRIDE_2x2 (1 << 8) // TODO: Check if the `STRIDED` mode is still `STRIDE_2x2`
+#define NEUREKA_FLAG_LINEAR_MODE (1 << 7)
+#define NEUREKA_FLAG_MODE_3x3 (0 << 5)
+#define NEUREKA_FLAG_MODE_3x3_DW (1 << 5)
+#define NEUREKA_FLAG_MODE_1x1 (2 << 5)
+#define NEUREKA_FLAG_NORM_QUANT (1 << 4)
+#define NEUREKA_FLAG_MODE_BASIC (0 << 3)
+#define NEUREKA_FLAG_MODE16 (1 << 3)
+
+/* Masks */
+
+#define NEUREKA_MASK_QUANT_FUNCTION (1 << 23)
+#define NEUREKA_MASK_QUANT_MODE (3 << 21)
+
+/* PADDING */
+
+#define NEUREKA_DONT_PAD (0)
+#define NEUREKA_MAX_PAD (2)
+
+/* NORM */
+#define NEUREKA_NORM_MAX_LEN (32)
+
+#endif // __NEUREKA_DEFS_H__
diff --git a/neureka/inc/pulp_nnx_defs.h b/neureka/inc/pulp_nnx_defs.h
deleted file mode 100644
index e8ecba5..0000000
--- a/neureka/inc/pulp_nnx_defs.h
+++ /dev/null
@@ -1,167 +0,0 @@
-/*
- * Luka Macan
- * Arpan Prasad
- *
- * Copyright 2023 ETH Zurich and University of Bologna
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - * - * SPDX-License-Identifier: Apache-2.0 - */ - -#ifndef __NEUREKA_DEFS_H__ -#define __NEUREKA_DEFS_H__ - -/* ARHITECTURE */ - -#define NEUREKA_FILTER_SIZE (6) -#define NEUREKA_FILTER_BUFFER_SIZE (8) -#define NEUREKA_INPUT_CHANNEL_THROUGHPUT (32) -#define NEUREKA_INPUT_CHANNEL_THROUGHPUT_3x3 (28) -#define NEUREKA_OUTPUT_CHANNEL_THROUGHPUT (32) -#define NEUREKA_CONTEXT_SIZE (2) -#define NEUREKA_WEIGHT_BANDWIDTH (256) - -#define NEUREKA_WEIGHT_D0_STRIDE_MODE8 (NEUREKA_INPUT_CHANNEL_THROUGHPUT / 8) -#define NEUREKA_WEIGHT_D0_STRIDE_MODE8_3x3 (NEUREKA_WEIGHT_BANDWIDTH / 8) -#define NEUREKA_WEIGHT_D0_STRIDE_MODE16 (NEUREKA_INPUT_CHANNEL_THROUGHPUT / 16) - -/* REGISTER MAP */ - -#define NEUREKA_EVT0 12 -#define NEUREKA_EVT1 13 -#define NEUREKA_BASE_ADDR 0x00201000 -#define WEIGHT_MEM_BASE 0x10400000 -#define SRAM_OFFSET 0x00400000 -#define MRAM_OFFSET 0x00000000 - -// Cluster -#define CLUSTER_CTRL_BASE_ADDR 0x00200000 -#define CLUSTER_CTRL_HWPE_OFFS 0x18 -#define CLUSTER_CTRL_HWPE_CG_EN_MASK 0x800 - -/* REGISTER OFFSETS */ - -// commands -#define NEUREKA_TRIGGER 0x00 -#define NEUREKA_ACQUIRE 0x04 -#define NEUREKA_FINISHED 0x08 -#define NEUREKA_STATUS 0x0C -#define NEUREKA_RUNNING_JOB 0x10 -#define NEUREKA_SOFT_CLEAR 0x14 -#define NEUREKA_SWSYNC 0x18 -#define NEUREKA_URISCY_IMEM 0x1C - -// job configuration -#define NEUREKA_REGISTER_OFFSET 0x20 - -#define NEUREKA_REG_WEIGHTS_PTR 0x00 -#define NEUREKA_REG_INFEAT_PTR 0x04 -#define NEUREKA_REG_OUTFEAT_PTR 0x08 -#define NEUREKA_REG_SCALE_PTR 0x0C -#define NEUREKA_REG_SCALE_SHIFT_PTR 0x10 -#define 
NEUREKA_REG_SCALE_BIAS_PTR 0x14 -#define NEUREKA_REG_INFEAT_D0_STRIDE 0x18 -#define NEUREKA_REG_INFEAT_D1_STRIDE 0x1C -#define NEUREKA_REG_INFEAT_D2_STRIDE 0x20 -#define NEUREKA_REG_OUTFEAT_D0_STRIDE 0x24 -#define NEUREKA_REG_OUTFEAT_D1_STRIDE 0x28 -#define NEUREKA_REG_OUTFEAT_D2_STRIDE 0x2C -#define NEUREKA_REG_WEIGHTS_D0_STRIDE 0x30 -#define NEUREKA_REG_WEIGHTS_D1_STRIDE 0x34 -#define NEUREKA_REG_WEIGHTS_D2_STRIDE 0x38 -#define NEUREKA_REG_SUBTILE_REMAINDER_0 0x3C -#define NEUREKA_REG_SUBTILE_REMAINDER_1 0x40 -#define NEUREKA_REG_SUBTILE_REMAINDER_2 0x44 -#define NEUREKA_REG_SUBTILE_NUMBER_0 0x48 -#define NEUREKA_REG_SUBTILE_NUMBER_1 0x4C -#define NEUREKA_REG_PADDING 0x50 -#define NEUREKA_REG_WEIGHT_OFFSET_FACTOR 0x54 -#define NEUREKA_REG_FILTER_MASKING 0x58 -#define NEUREKA_REG_CONF0 0x5C - -// Simulation only -#define NEUREKA_REG_GVSOC_TRACE 0x60 - -/* SHIFT */ - -#define NEUREKA_SHIFT_FLAG_NORM_BIAS (25) -#define NEUREKA_SHIFT_FLAG_NORM_SHIFT (24) -#define NEUREKA_SHIFT_QUANT_SHIFT (16) -#define NEUREKA_SHIFT_ROUNDING (11) - -/* CONF0 FLAGS */ - -#define NEUREKA_FLAG_NORM_BIAS (1 << 25) -#define NEUREKA_FLAG_NORM_SHIFT (1 << 24) -#define NEUREKA_FLAG_QUANT_FUNCTION_IDENTITY (1 << 23) -#define NEUREKA_FLAG_QUANT_FUNCTION_RELU (0 << 23) -#define NEUREKA_QUANT_MODE_8BIT (0 << 21) -#define NEUREKA_QUANT_MODE_16BIT (1 << 21) -#define NEUREKA_QUANT_MODE_32BIT (2 << 21) -// conf0[20:16] - quantization shift amount -#define NEUREKA_FLAG_WEIGHT_OFFSET_SYMMETRIC (0 << 15) -#define NEUREKA_FLAG_WEIGHT_OFFSET_LAYER_WISE (1 << 15) -#define NEUREKA_FLAG_STREAMIN (1 << 14) -#define NEUREKA_NORM_MODE_8BIT (0 << 12) -#define NEUREKA_NORM_MODE_16BIT (1 << 12) -#define NEUREKA_NORM_MODE_32BIT (2 << 12) -#define NEUREKA_FLAG_ROUND (1 << 11) -#define NEUREKA_FLAG_ACTIVATION_PREFETCH (1 << 10) -#define NEUREKA_FLAG_USE_WMEM (1 << 9) -#define NEUREKA_FLAG_USE_TCDM (0 << 9) -#define NEUREKA_FLAG_STRIDED_MODE (1 << 8) -#define NEUREKA_FLAG_LINEAR_MODE (1 << 7) -#define 
NEUREKA_FLAG_MODE_3x3 (0 << 5) -#define NEUREKA_FLAG_MODE_3x3_DW (1 << 5) -#define NEUREKA_FLAG_MODE_1x1 (2 << 5) -#define NEUREKA_FLAG_NORM_QUANT (1 << 4) -#define NEUREKA_FLAG_MODE_BASIC (0 << 3) -#define NEUREKA_FLAG_MODE16 (1 << 3) - -/* Masks */ - -#define NEUREKA_MASK_QUANT_FUNCTION (1 << 23) -#define NEUREKA_MASK_QUANT_MODE (3 << 21) - -/* Miscellaneous */ - -// Padding -#define MAX_PAD (0xf) - -// Normalization -#define NEUREKA_NORM_MAX_LEN (32) -#define NO_NORM(length) \ - { \ - .scale = scale_identity, .bias = NEUREKA_NULL, .shift = NEUREKA_NULL, \ - .length = length, .mode = normMode32Bit \ - } - -// Quantization -#define NO_QUANT \ - { \ - .shift_amount = 0, .mode = quantMode32Bit, \ - .function = quantFunctionIdentity \ - } - -// GVSOC trace levels -#define NEUREKA_TRACE_LEVEL_JOB_START_END 0 -#define NEUREKA_TRACE_LEVEL_CONFIG 1 -#define NEUREKA_TRACE_LEVEL_ACTIV_INOUT 2 -#define NEUREKA_TRACE_LEVEL_ALL 3 - -// null -#define NEUREKA_NULL ((void *)0) -#define NEUREKA_STATUS_FULL (0x101) - -#endif // __NEUREKA_DEFS_H__ diff --git a/neureka/inc/pulp_nnx_hal.h b/neureka/inc/pulp_nnx_hal.h deleted file mode 100644 index 40bcec0..0000000 --- a/neureka/inc/pulp_nnx_hal.h +++ /dev/null @@ -1,217 +0,0 @@ -/* - * Luka Macan - * - * Copyright 2023 ETH Zurich and University of Bologna - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- * - * SPDX-License-Identifier: Apache-2.0 - */ - -#ifndef __NEUREKA_H__ -#define __NEUREKA_H__ - -#include - -#include "pulp_nnx_defs.h" -#include "pulp_nnx_error_codes.h" - -#define NEUREKA_CG_ENABLE() \ - *(volatile int *)(CLUSTER_CTRL_BASE_ADDR + CLUSTER_CTRL_HWPE_OFFS) |= \ - CLUSTER_CTRL_HWPE_CG_EN_MASK -#define NEUREKA_CG_DISABLE() \ - *(volatile int *)(CLUSTER_CTRL_BASE_ADDR + CLUSTER_CTRL_HWPE_OFFS) &= \ - ~CLUSTER_CTRL_HWPE_CG_EN_MASK - -#define NEUREKA_WRITE(offset, value) \ - *(int volatile *)(NEUREKA_BASE_ADDR + (offset)) = (value) -#define NEUREKA_WRITE_BE(offset, value, be) \ - *(char volatile *)(NEUREKA_BASE_ADDR + (offset) + (be)) = (value) -#define NEUREKA_READ(offset) *(int volatile *)(NEUREKA_BASE_ADDR + (offset)) - -#define NEUREKA_WRITE_IO_REG(offset, value) \ - NEUREKA_WRITE(NEUREKA_REGISTER_OFFSET + (offset), (value)) -#define NEUREKA_WRITE_IO_REG_BE(offset, value, be) \ - NEUREKA_WRITE_BE(NEUREKA_REGISTER_OFFSET + (offset), (value), (be)) -#define NEUREKA_READ_IO_REG(offset) \ - NEUREKA_READ(NEUREKA_REGISTER_OFFSET + (offset)) - -#define NEUREKA_BARRIER_NOSTATUS() eu_evt_maskWaitAndClr(1 << NEUREKA_EVT0) -#define NEUREKA_BARRIER() \ - do { \ - eu_evt_maskWaitAndClr(1 << NEUREKA_EVT0); \ - } while ((*(int volatile *)(NEUREKA_BASE_ADDR + NEUREKA_STATUS)) != 0) -#define NEUREKA_BUSYWAIT() \ - do { \ - } while ((*(int volatile *)(NEUREKA_BASE_ADDR + NEUREKA_STATUS)) != 0) -#define NEUREKA_BARRIER_ACQUIRE(job_id) \ - job_id = NEUREKA_READ(NEUREKA_ACQUIRE); \ - while (job_id < 0) { \ - eu_evt_maskWaitAndClr(1 << NEUREKA_EVT0); \ - job_id = NEUREKA_READ(NEUREKA_ACQUIRE); \ - }; -#define NEUREKA_NOBARRIER_ACQUIRE(job_id) \ - job_id = NEUREKA_READ(NEUREKA_ACQUIRE); \ - while (job_id < 0) { \ - job_id = NEUREKA_READ(NEUREKA_ACQUIRE); \ - }; - -#define DIVNCEIL(A, B) (((A - 1) / B) + 1) -#define REMAINDER(A, B) (((A - 1) % B) + 1) -#define CONCAT_HALF(A, B) (((A & 0xffff) << 16) | (B & 0xffff)) - -#define NNX_CONTEXT_SIZE NEUREKA_CONTEXT_SIZE - 
-#define FLAG_USED (1) -#define FLAG_UNUSED (0) - -typedef enum { - weightOffsetModeSymmetric = NEUREKA_FLAG_WEIGHT_OFFSET_SYMMETRIC, - weightOffsetModeLayerWise = NEUREKA_FLAG_WEIGHT_OFFSET_LAYER_WISE -} nnx_weight_offset_mode_e; - -typedef struct { - void *data; - uint16_t height; - uint16_t width; - uint16_t depth; - uint16_t n_weights; - uint32_t bitwidth; - int32_t offset_factor; - nnx_weight_offset_mode_e offset_mode; -} nnx_weights_t; - -typedef enum { - featureBitwidth8Bit = 8, - featureBitwidth16Bit = 16, - featureBitwidth32Bit = 32 -} nnx_feature_bitwidth_e; - -typedef struct { - void *data; - uint16_t height; - uint16_t width; - uint16_t depth; - nnx_feature_bitwidth_e bitwidth; -} nnx_feature_t; - -typedef enum { - normMode8Bit = NEUREKA_NORM_MODE_8BIT, - normMode16Bit = NEUREKA_NORM_MODE_16BIT, - normMode32Bit = NEUREKA_NORM_MODE_32BIT -} nnx_norm_mode_e; - -typedef struct { - nnx_norm_mode_e mode; - int flag_bias; - int flag_shift; -} nnx_norm_t; - -typedef enum { - quantMode8Bit = NEUREKA_QUANT_MODE_8BIT, - quantMode16Bit = NEUREKA_QUANT_MODE_16BIT, - quantMode32Bit = NEUREKA_QUANT_MODE_32BIT -} nnx_quant_mode_e; - -typedef enum { - quantFunctionIdentity = NEUREKA_FLAG_QUANT_FUNCTION_IDENTITY, - quantFunctionRelu = NEUREKA_FLAG_QUANT_FUNCTION_RELU -} nnx_quant_function_e; - -// TODO: add rounding to quant. Should also be an enum? Best boolean... 
-typedef struct { - // Shift amount must be in range 0x00-0x1F - unsigned shift_amount; - nnx_quant_mode_e mode; - nnx_quant_function_e function; - int flag_rounding; -} nnx_quant_t; - -typedef struct { - uint32_t d0; - uint32_t d1; - uint32_t d2; -} nnx_stride_t; - -typedef struct { - uint32_t KoKi; - uint32_t HoWo; - uint32_t HiWi; -} nnx_subtile_remainder_t; - -typedef struct { - uint32_t KoKi; - uint32_t HoWo; -} nnx_subtile_number_t; - -typedef struct { - nnx_subtile_remainder_t remainder; - nnx_subtile_number_t number; -} nnx_subtile_t; - -typedef struct { - nnx_stride_t input_stride; - nnx_stride_t output_stride; - nnx_stride_t weights_stride; - nnx_subtile_t subtile; - uint32_t padding; - uint32_t weight_offset_factor; - uint32_t filter_mask; - uint32_t conf0; -} nnx_cfg_t; - -typedef struct { - uint32_t weights_ptr; - uint32_t infeat_ptr; - uint32_t outfeat_ptr; - uint32_t scale_ptr; - uint32_t scale_shift_ptr; - uint32_t scale_bias_ptr; - nnx_cfg_t cfg; -} nnx_task_t; - -int nnx_job_id(); -int nnx_empty(); -int nnx_full(); -void nnx_soft_clear(); -int nnx_acquire(); -void nnx_offload(nnx_task_t *task); -void nnx_offload_ptr(nnx_task_t *task); -void nnx_run_async(); -void nnx_run_blocking(); -void nnx_commit(); -void nnx_wait_empty(); -void nnx_wait_not_full(); -void nnx_wait_on_id(int id); -void nnx_busywait(); - -void nnx_task_init(nnx_task_t *task); -int nnx_pad_input(nnx_cfg_t *cfg, uint32_t top, uint32_t right, uint32_t bottom, - uint32_t left, uint16_t value); -int nnx_norm_quant(nnx_cfg_t *cfg, nnx_norm_t norm, nnx_quant_t quant); -void nnx_mask_filter(nnx_cfg_t *cfg, uint8_t top, uint8_t right, uint8_t bottom, - uint8_t left); -nnx_error_code nnx_conv_1x1(nnx_cfg_t *cfg, nnx_weights_t weights, - nnx_feature_t input, nnx_feature_t output); -nnx_error_code nnx_conv_1x1_update_dims(nnx_cfg_t *cfg, int h_out, int w_out, - int k_out, int k_in); -nnx_error_code nnx_conv_3x3(nnx_cfg_t *cfg, nnx_weights_t weights, - nnx_feature_t input, nnx_feature_t 
output); -nnx_error_code nnx_conv_3x3_update_dims(nnx_cfg_t *cfg, int h_out, int w_out, - int k_out, int k_in); -nnx_error_code nnx_conv_3x3_dw(nnx_cfg_t *cfg, nnx_weights_t weights, - nnx_feature_t input, nnx_feature_t output); -nnx_error_code nnx_conv_3x3_dw_update_dims(nnx_cfg_t *cfg, int h_out, int w_out, - int k_out, int k_in); - -#endif /* __NEUREKA_H__ */ diff --git a/neureka/inc/pulp_nnx_util.h b/neureka/inc/pulp_nnx_util.h deleted file mode 100644 index f29ff3e..0000000 --- a/neureka/inc/pulp_nnx_util.h +++ /dev/null @@ -1,27 +0,0 @@ -/* - * Luka Macan - * - * Copyright 2023 ETH Zurich and University of Bologna - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - * - * SPDX-License-Identifier: Apache-2.0 - */ - -#ifndef __PULP_NNX_UTIL__ -#define __PULP_NNX_UTIL__ - -void nnx_activate_gvsoc_logging(int use_dec); -void nnx_deactivate_gvsoc_logging(); - -#endif /* __PULP_NNX_UTIL__ */ diff --git a/neureka/src/pulp_nnx_hal.c b/neureka/src/pulp_nnx_hal.c deleted file mode 100644 index 1d99691..0000000 --- a/neureka/src/pulp_nnx_hal.c +++ /dev/null @@ -1,412 +0,0 @@ -/* - * Luka Macan - * Arpan Prasad - * - * Copyright 2023 ETH Zurich and University of Bologna - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. 
- * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - * - * SPDX-License-Identifier: Apache-2.0 - */ - -#include "pulp_nnx_hal.h" -#include "pmsis.h" - -static int qw, weight_d0_stride, outbytes; - -// TODO For all the following functions we use __builtin_pulp_OffsetedWrite and -// __builtin_pulp_OffsetedRead instead of classic load/store because otherwise -// the compiler is not able to correctly factorize the NEUREKA base in case -// several accesses are done, ending up with twice more code - -// __builtin_pulp_OffsetedX not defined - needs further investigation... (too -// old PULP toolchain? used v1.0.16) It is used inside PULP-SDK... 
- -int nnx_empty() { return !NEUREKA_READ(NEUREKA_STATUS); } - -int nnx_full() { return NEUREKA_READ(NEUREKA_STATUS) == NEUREKA_STATUS_FULL; } - -int nnx_job_id() { return NEUREKA_READ(NEUREKA_RUNNING_JOB); } - -void nnx_soft_clear() { - NEUREKA_WRITE(NEUREKA_SOFT_CLEAR, 0); - for (volatile int i = 0; i < 10; i++) - ; -} - -int nnx_acquire() { - int job_id = -1; - NEUREKA_BARRIER_ACQUIRE(job_id); - return job_id; -} - -void nnx_offload(nnx_task_t *task) { - int *task_data = (int *)task; - for (int i = 0; i < sizeof(nnx_task_t) / 4; ++i) { - NEUREKA_WRITE_IO_REG(i * 4, task_data[i]); - } -} - -void nnx_offload_ptr(nnx_task_t *task) { - int *task_data = (int *)task; - for (int i = 0; i < 6; ++i) { - NEUREKA_WRITE_IO_REG(i * 4, task_data[i]); - } -} - -void nnx_run_async() { NEUREKA_WRITE(NEUREKA_TRIGGER, 0); } - -void nnx_run_blocking() { - nnx_run_async(); - nnx_wait_empty(); -} - -void nnx_commit() { - NEUREKA_WRITE(NEUREKA_TRIGGER, 1); // commit, no trigger -} - -void nnx_busywait() { NEUREKA_BUSYWAIT(); } - -void nnx_wait_empty() { - while (!nnx_empty()) - NEUREKA_BARRIER_NOSTATUS(); -} - -void nnx_wait_not_full() { - while (nnx_full()) - NEUREKA_BARRIER_NOSTATUS(); -} - -void nnx_wait_on_id(const int id) { - while (nnx_job_id() <= id) { - eu_evt_maskWaitAndClr(1 << NEUREKA_EVT0); - }; -} - -void nnx_task_init(nnx_task_t *task) { memset(task, 0, sizeof(nnx_task_t)); } - -int nnx_pad_input(nnx_cfg_t *cfg, const uint32_t top, const uint32_t right, - const uint32_t bottom, const uint32_t left, - const uint16_t value) { - uint32_t padding = 0; - uint32_t flags = 0; - - if (top > MAX_PAD || right > MAX_PAD || bottom > MAX_PAD || left > MAX_PAD) { - return 1; - } - - cfg->padding = - (top << 28) + (right << 24) + (bottom << 20) + (left << 16) + value; - - return 0; -} - -int nnx_norm_quant(nnx_cfg_t *cfg, const nnx_norm_t norm, - const nnx_quant_t quant) { - if (quant.shift_amount > 31) { - printf("ERROR! 
quant.shift_amount > 31\n"); - return 1; - } - - if (quant.mode == quantMode16Bit) { - printf("ERROR! quant.mode == quantMode16Bit\n"); - return 1; - } - - BIT_SET(cfg->conf0, NEUREKA_FLAG_NORM_QUANT | quant.function | quant.mode | - (quant.shift_amount << 16) | - quant.flag_rounding << NEUREKA_SHIFT_ROUNDING | - norm.mode | - norm.flag_bias << NEUREKA_SHIFT_FLAG_NORM_BIAS | - norm.flag_shift << NEUREKA_SHIFT_FLAG_NORM_SHIFT); - - return 0; -} - -void nnx_mask_filter(nnx_cfg_t *cfg, const uint8_t top, const uint8_t right, - const uint8_t bottom, const uint8_t left) { - cfg->filter_mask = ((uint32_t)top << 24) | ((uint32_t)right << 16) | - ((uint32_t)bottom << 8) | ((uint32_t)left << 0); -} - -nnx_error_code nnx_conv_1x1_update_dims(nnx_cfg_t *cfg, const int h_out, - const int w_out, const int k_out, - const int k_in) { - - const int num_Ko = divnceil(k_out, NEUREKA_OUTPUT_CHANNEL_THROUGHPUT); - const int num_Ki = divnceil(k_in, NEUREKA_INPUT_CHANNEL_THROUGHPUT); - const int num_Ho = divnceil(h_out, NEUREKA_FILTER_SIZE); - const int num_Wo = divnceil(w_out, NEUREKA_FILTER_SIZE); - - const int rem_Ko = remainder(k_out, NEUREKA_OUTPUT_CHANNEL_THROUGHPUT); - const int rem_Ki = remainder(k_in, NEUREKA_INPUT_CHANNEL_THROUGHPUT); - const int rem_Ho = remainder(h_out, NEUREKA_FILTER_SIZE); - const int rem_Wo = remainder(w_out, NEUREKA_FILTER_SIZE); - const int rem_Hi = rem_Ho; - const int rem_Wi = rem_Wo; - - const nnx_subtile_t subtile = { - .number = {.KoKi = concat_half(num_Ko, num_Ki), - .HoWo = concat_half(num_Ho, num_Wo)}, - .remainder = {.KoKi = concat_half(rem_Ko, rem_Ki), - .HoWo = concat_half(rem_Ho, rem_Wo), - .HiWi = concat_half(rem_Hi, rem_Wi)}}; - cfg->subtile = subtile; - - // Strides - const nnx_stride_t input_stride = { - .d0 = k_in, - .d1 = k_in * w_out, - .d2 = k_in * 3 * 3 // copying arpan - }; - cfg->input_stride = input_stride; - - const nnx_stride_t output_stride = { - .d0 = 32, .d1 = k_out * outbytes, .d2 = k_out * outbytes * w_out}; - 
cfg->output_stride = output_stride; - - const nnx_stride_t weights_stride = { - .d0 = weight_d0_stride * qw, - .d1 = weight_d0_stride * qw * num_Ki, - .d2 = 0 // Unused - }; - cfg->weights_stride = weights_stride; - - return 0; -} - -nnx_error_code nnx_conv_1x1(nnx_cfg_t *cfg, const nnx_weights_t weights, - const nnx_feature_t input, - const nnx_feature_t output) { - if (weights.bitwidth < 2 || weights.bitwidth > 8) { - return weightBitwidthOutOfBounds; - } - - if (weights.offset_mode != weightOffsetModeLayerWise) { - // Currently only layer-wise mode is used. - return unsupportedWeightOffsetMode; - } - - if ((input.bitwidth != featureBitwidth8Bit && - input.bitwidth != featureBitwidth16Bit) || - (output.bitwidth != featureBitwidth8Bit && - output.bitwidth != featureBitwidth32Bit)) { - return unsupportedFeatureBitwidth; - } - - if (input.height != output.height || input.width != output.width || - input.depth != weights.depth || output.depth != weights.n_weights) { - return dimensionMismatch; - } - - const int mode16 = - input.bitwidth == 16 ? NEUREKA_FLAG_MODE16 : NEUREKA_FLAG_MODE_BASIC; - - BIT_SET(cfg->conf0, weights.offset_mode | NEUREKA_FLAG_MODE_1x1 | mode16 | - (weights.bitwidth - 1)); - - // Global static variables needed by update_dims - outbytes = output.bitwidth / 8; - weight_d0_stride = - mode16 ? 
NEUREKA_WEIGHT_D0_STRIDE_MODE16 : NEUREKA_WEIGHT_D0_STRIDE_MODE8; - qw = weights.bitwidth; - - nnx_conv_1x1_update_dims(cfg, output.height, output.width, output.depth, - input.depth); - - // cfg->weight_offset_factor = SMALLEST_SIGNED(weights.bitwidth); - cfg->weight_offset_factor = weights.offset_factor; - - return 0; -} - -nnx_error_code nnx_conv_3x3_update_dims(nnx_cfg_t *cfg, const int h_out, - const int w_out, const int k_out, - const int k_in) { - - const int num_Ko = divnceil(k_out, NEUREKA_OUTPUT_CHANNEL_THROUGHPUT); - const int num_Ki = divnceil(k_in, NEUREKA_INPUT_CHANNEL_THROUGHPUT_3x3); - const int num_Ho = divnceil(h_out, NEUREKA_FILTER_SIZE); - const int num_Wo = divnceil(w_out, NEUREKA_FILTER_SIZE); - - const int rem_Ko = remainder(k_out, NEUREKA_OUTPUT_CHANNEL_THROUGHPUT); - const int rem_Ki = remainder(k_in, NEUREKA_INPUT_CHANNEL_THROUGHPUT_3x3); - const int rem_Ho = remainder(h_out, NEUREKA_FILTER_SIZE); - const int rem_Wo = remainder(w_out, NEUREKA_FILTER_SIZE); - const int rem_Hi = rem_Ho + 2; - const int rem_Wi = rem_Wo + 2; - - const nnx_subtile_t subtile = { - .number = {.KoKi = concat_half(num_Ko, num_Ki), - .HoWo = concat_half(num_Ho, num_Wo)}, - .remainder = {.KoKi = concat_half(rem_Ko, rem_Ki), - .HoWo = concat_half(rem_Ho, rem_Wo), - .HiWi = concat_half(rem_Hi, rem_Wi)}}; - cfg->subtile = subtile; - - // Strides - const nnx_stride_t input_stride = {.d0 = k_in, - .d1 = k_in * (w_out + 2), - .d2 = k_in * NEUREKA_FILTER_BUFFER_SIZE * - NEUREKA_FILTER_BUFFER_SIZE}; - cfg->input_stride = input_stride; - - const nnx_stride_t output_stride = { - .d0 = 32, .d1 = k_out * outbytes, .d2 = k_out * outbytes * w_out}; - cfg->output_stride = output_stride; - - const nnx_stride_t weights_stride = { - .d0 = NEUREKA_WEIGHT_D0_STRIDE_MODE8_3x3, - .d1 = NEUREKA_WEIGHT_D0_STRIDE_MODE8_3x3 * qw * num_Ki, - .d2 = 0 // Unused - }; - cfg->weights_stride = weights_stride; - - return 0; -} - -nnx_error_code nnx_conv_3x3(nnx_cfg_t *cfg, const nnx_weights_t weights, 
- const nnx_feature_t input, - const nnx_feature_t output) { - if (weights.bitwidth < 2 || weights.bitwidth > 8) { - return weightBitwidthOutOfBounds; - } - - if (weights.offset_mode != weightOffsetModeLayerWise) { - // Currently only layer-wise mode is used. - return unsupportedWeightOffsetMode; - } - - if ((input.bitwidth != featureBitwidth8Bit && - input.bitwidth != featureBitwidth16Bit) || - (output.bitwidth != featureBitwidth8Bit && - output.bitwidth != featureBitwidth32Bit)) { - return unsupportedFeatureBitwidth; - } - - if (input.height - 2 != output.height || input.width - 2 != output.width || - input.depth != weights.depth || output.depth != weights.n_weights) { - return dimensionMismatch; - } - - const int mode16 = - input.bitwidth == 16 ? NEUREKA_FLAG_MODE16 : NEUREKA_FLAG_MODE_BASIC; - - BIT_SET(cfg->conf0, weights.offset_mode | NEUREKA_FLAG_MODE_3x3 | mode16 | - (weights.bitwidth - 1)); - - // Global static variables needed by update_dims - outbytes = output.bitwidth / 8; - weight_d0_stride = - mode16 ? 
NEUREKA_WEIGHT_D0_STRIDE_MODE16 : NEUREKA_WEIGHT_D0_STRIDE_MODE8; - qw = weights.bitwidth; - - nnx_conv_3x3_update_dims(cfg, output.height, output.width, output.depth, - input.depth); - - // cfg->weight_offset_factor = SMALLEST_SIGNED(weights.bitwidth); - cfg->weight_offset_factor = weights.offset_factor; - - return 0; -} - -nnx_error_code nnx_conv_3x3_dw_update_dims(nnx_cfg_t *cfg, const int h_out, - const int w_out, const int k_out, - const int k_in) { - - const int num_Ko = divnceil(k_out, NEUREKA_INPUT_CHANNEL_THROUGHPUT_3x3); - const int num_Ki = num_Ko; - const int num_Ho = divnceil(h_out, NEUREKA_FILTER_SIZE); - const int num_Wo = divnceil(w_out, NEUREKA_FILTER_SIZE); - - const int rem_Ko = remainder(k_out, NEUREKA_INPUT_CHANNEL_THROUGHPUT_3x3); - const int rem_Ki = rem_Ko; - const int rem_Ho = remainder(h_out, NEUREKA_FILTER_SIZE); - const int rem_Wo = remainder(w_out, NEUREKA_FILTER_SIZE); - const int rem_Hi = rem_Ho + 2; - const int rem_Wi = rem_Wo + 2; - - const nnx_subtile_t subtile = { - .number = {.KoKi = concat_half(num_Ko, num_Ki), - .HoWo = concat_half(num_Ho, num_Wo)}, - .remainder = {.KoKi = concat_half(rem_Ko, rem_Ki), - .HoWo = concat_half(rem_Ho, rem_Wo), - .HiWi = concat_half(rem_Hi, rem_Wi)}}; - cfg->subtile = subtile; - - // Strides - const nnx_stride_t input_stride = { - .d0 = k_out, - .d1 = k_out * (w_out + 2), - .d2 = 0 // Unused - }; - cfg->input_stride = input_stride; - - const nnx_stride_t output_stride = { - .d0 = 32, .d1 = k_out * outbytes, .d2 = k_out * outbytes * w_out}; - cfg->output_stride = output_stride; - - const nnx_stride_t weights_stride = { - .d0 = NEUREKA_FILTER_SIZE * NEUREKA_FILTER_SIZE * weight_d0_stride, - .d1 = 0, - .d2 = 0 // Unused - }; - cfg->weights_stride = weights_stride; - - return 0; -} - -nnx_error_code nnx_conv_3x3_dw(nnx_cfg_t *cfg, const nnx_weights_t weights, - const nnx_feature_t input, - const nnx_feature_t output) { - if (weights.bitwidth < 2 || weights.bitwidth > 8) { - return 
weightBitwidthOutOfBounds; - } - - if (weights.offset_mode != weightOffsetModeLayerWise) { - // Currently only layer-wise mode is used. - return unsupportedWeightOffsetMode; - } - - if ((input.bitwidth != featureBitwidth8Bit && - input.bitwidth != featureBitwidth16Bit) || - (output.bitwidth != featureBitwidth8Bit && - output.bitwidth != featureBitwidth32Bit)) { - return unsupportedFeatureBitwidth; - } - - if (input.height - 2 != output.height || input.width - 2 != output.width || - input.depth != output.depth) { - return dimensionMismatch; - } - - const int mode16 = - input.bitwidth == 16 ? NEUREKA_FLAG_MODE16 : NEUREKA_FLAG_MODE_BASIC; - - BIT_SET(cfg->conf0, weights.offset_mode | NEUREKA_FLAG_MODE_3x3_DW | mode16 | - (weights.bitwidth - 1)); - - // Global static variables needed by update_dims - outbytes = output.bitwidth / 8; - weight_d0_stride = - mode16 ? NEUREKA_WEIGHT_D0_STRIDE_MODE16 : NEUREKA_WEIGHT_D0_STRIDE_MODE8; - qw = weights.bitwidth; - - nnx_conv_3x3_dw_update_dims(cfg, output.height, output.width, output.depth, - input.depth); - - // cfg->weight_offset_factor = SMALLEST_SIGNED(weights.bitwidth); - cfg->weight_offset_factor = weights.offset_factor; - - return 0; -} diff --git a/src/pulp_nnx_neureka.c b/src/pulp_nnx_neureka.c new file mode 100644 index 0000000..1efb34f --- /dev/null +++ b/src/pulp_nnx_neureka.c @@ -0,0 +1,131 @@ +/* + * Luka Macan + * + * Copyright 2023 ETH Zurich and University of Bologna + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+ * See the License for the specific language governing permissions and + * limitations under the License. + * + * SPDX-License-Identifier: Apache-2.0 + */ + +#include "pulp_nnx_neureka.h" +#include "hwpe.h" +#include "neureka.h" +#include "pulp_nnx_util.h" +#include +#include +#include + +void neureka_nnx_init(neureka_dev_t *dev, neureka_pulp_conf_t *conf) { + neureka_pulp_open(conf); + hwpe_soft_clear(&dev->hwpe_dev); +} + +void neureka_nnx_term(neureka_dev_t *dev) { + hwpe_soft_clear(&dev->hwpe_dev); + neureka_pulp_close(); +} + +int neureka_nnx_dispatch_check(neureka_dev_t *dev) { + return !neureka_task_queue_full(dev); +} + +void neureka_nnx_dispatch_wait(neureka_dev_t *dev) { + while (!neureka_nnx_dispatch_check(dev)) { + neureka_pulp_event_wait_and_clear(); + } +} + +int neureka_nnx_dispatch(neureka_dev_t *dev, neureka_task_t *task) { + if (hwpe_task_queue_acquire_task(&dev->hwpe_dev, &task->id)) { + return 1; + } + hwpe_task_queue_write_task(&dev->hwpe_dev, (uint32_t *)&task->data, + (int)(sizeof(neureka_task_data_t) / 4)); + hwpe_task_queue_release_and_run(&dev->hwpe_dev); + return 0; +} + +int neureka_nnx_resolve_check(neureka_dev_t *dev, neureka_task_t *task) { +#if __PLATFORM__ == ARCHI_PLATFORM_GVSOC + // GVSOC model has a broken running_id so resolve_check + // conservatively checks whether the task queue is empty.
+ return neureka_task_queue_empty(dev); +#else + uint8_t prev_task_id = task->id - 1; + return !(hwpe_last_task_id(&dev->hwpe_dev) == prev_task_id || + (hwpe_last_task_id(&dev->hwpe_dev) == task->id && + !neureka_task_queue_empty(dev))); +#endif +} + +void neureka_nnx_resolve_wait(neureka_dev_t *dev, neureka_task_t *task) { + while (!neureka_nnx_resolve_check(dev, task)) { + neureka_pulp_event_wait_and_clear(); + } +} + +static inline uint32_t _get_tile_ptr(uint32_t ptr, int i, int j, int size_i, + uint32_t size_j, uint32_t size_k, + uint32_t stride_j, uint32_t stride_k, + uint32_t overlap_i, uint32_t overlap_j, + uint32_t offset_i, uint32_t offset_j, + uint8_t data_size) { + return ptr + + (i * (size_i - overlap_i) - offset_i) * stride_j * stride_k * + data_size / 8 + + (j * (size_j - overlap_j) - offset_j) * stride_k * data_size / 8; +} + +void neureka_nnx_dispatch_stride2x2( + neureka_dev_t *dev, neureka_task_t *task, const uint32_t w_in, const uint32_t k_in, + const uint32_t w_in_stride, const uint32_t k_in_stride, + const uint32_t h_out, const uint32_t w_out, const uint32_t k_out, + const uint32_t w_out_stride, const uint32_t k_out_stride, + const uint8_t h_ker, const uint8_t w_ker) { + const uint8_t stride = 2; + const uint8_t bits = 8; + + const uint32_t n_h = divnceil(h_out, stride); + const uint32_t n_w = divnceil(w_out, stride); + const uint32_t input_height_offset = h_out % stride == 1 ? stride : 0; + const uint32_t input_width_offset = w_out % stride == 1 ? stride : 0; + const uint32_t output_height_offset = h_out % stride == 1 ? 1 : 0; + const uint32_t output_width_offset = w_out % stride == 1 ? 
1 : 0; + + const uint32_t input_base = task->data.infeat_ptr; + const uint32_t output_base = task->data.outfeat_ptr; + const uint32_t tile_padding = task->data.cfg.padding; + + for (int i = 0; i < n_h; i++) { + for (int j = 0; j < n_w; j++) { + task->data.infeat_ptr = + _get_tile_ptr(input_base, i, j, 3 + h_ker - 1, 3 + w_ker - 1, k_in, + w_in_stride, k_in_stride, h_ker - stride, + w_ker - stride, i == 0 ? 0 : input_height_offset, + j == 0 ? 0 : input_width_offset, bits); + task->data.outfeat_ptr = + _get_tile_ptr(output_base, i, j, 2, 2, k_out, w_out_stride, + k_out_stride, 0, 0, i == 0 ? 0 : output_height_offset, + j == 0 ? 0 : output_width_offset, bits); + + task->data.cfg.padding = + neureka_get_tile_padding(tile_padding, i, j, n_h, n_w); + + // Altered dispatch to wait if cannot acquire + while (neureka_nnx_dispatch(dev, task)) { + neureka_pulp_event_wait_and_clear(); + } + } + } +}