From c764efe5bb00304954a0560ba295920677563c2e Mon Sep 17 00:00:00 2001 From: Luka Macan Date: Sun, 14 Jan 2024 18:14:19 +0100 Subject: [PATCH] Decouple pulp-sdk and add hwpe device structure (#1) * Add hwpe interface and decouple pulp-sdk from hal * Fixes and changes to test Fixed/ignored some pyright bugs. Made generated data files external which additionally required touching of the nnx_layer.c file before compiling to have correct compilation but still without `make clean`. Added `-flto` flag to compilation. Some formatting fixes. --- CHANGELOG.md | 23 ++ inc/pulp_nnx.h | 75 ------- inc/pulp_nnx_ne16.h | 77 +++++++ ne16/bsp/ne16_pulp_bsp.c | 85 ++++++++ ne16/bsp/ne16_pulp_bsp.h | 81 +++++++ .../{ne16_gvsoc_logging.h => ne16_gvsoc.h} | 35 ++-- ne16/hal/ne16.c | 39 ++++ ne16/hal/ne16.h | 36 ++++ ne16/hal/ne16_defs.h | 157 -------------- ne16/hal/ne16_hal.h | 197 ------------------ ne16/hal/{ne16_hal.c => ne16_task.c} | 152 +++++++------- ne16/hal/ne16_task.h | 172 +++++++++++++++ ne16/hal/ne16_task_defs.h | 107 ++++++++++ src/pulp_nnx_ne16.c | 185 ++++------------ test/.gitignore | 2 +- test/HeaderWriter.py | 52 ++++- test/Ne16TestClasses.py | 38 ++-- test/TestClasses.py | 4 +- test/app/Makefile | 38 ++-- test/app/src/main.c | 10 +- test/app/src/nnx_layer.c | 106 +++++----- test/conftest.py | 2 +- test/test.py | 2 + util/hwpe.c | 85 ++++++++ util/hwpe.h | 43 ++++ 25 files changed, 1044 insertions(+), 759 deletions(-) delete mode 100644 inc/pulp_nnx.h create mode 100644 inc/pulp_nnx_ne16.h create mode 100644 ne16/bsp/ne16_pulp_bsp.c create mode 100644 ne16/bsp/ne16_pulp_bsp.h rename ne16/gvsoc/{ne16_gvsoc_logging.h => ne16_gvsoc.h} (51%) create mode 100644 ne16/hal/ne16.c create mode 100644 ne16/hal/ne16.h delete mode 100644 ne16/hal/ne16_defs.h delete mode 100644 ne16/hal/ne16_hal.h rename ne16/hal/{ne16_hal.c => ne16_task.c} (61%) create mode 100644 ne16/hal/ne16_task.h create mode 100644 ne16/hal/ne16_task_defs.h create mode 100644 util/hwpe.c create mode 100644 
util/hwpe.h diff --git a/CHANGELOG.md b/CHANGELOG.md index 789566b..623a775 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,5 +1,28 @@ # Changelog +## [Unreleased] + +### Added + +- New Hardware Processing Engine (HWPE) device in `util/hwpe.h` +- A device structure for ne16 `ne16_dev_t` in `ne16/hal/ne16.h` which extends the hwpe device +- Test app Makefile now has an `ACCELERATOR` variable to specify which accelerator is used + +### Changed + +- Library functions no longer start with a generic `nnx_` prefix but with an accelerator-specific `<accelerator>_nnx_` prefix (e.g. `ne16_nnx_`) + to allow for usage of multiple kinds of accelerators in the same system +- Decoupled board specific functionality into `ne16/bsp` which also contains the constant global instances + of the `ne16_dev_t` structure +- Moved all task related functions (`nnx_task_set_dims*`) into `ne16/hal/ne16_task.c` +- Tests adjusted for the new interface +- Test data generation moved into source files with extern declarations to check the output from the main + +### Fixed + +- pyright errors +- formatting errors + ## [0.2.1] - 2024-01-08 ### Fixed diff --git a/inc/pulp_nnx.h b/inc/pulp_nnx.h deleted file mode 100644 index 312eaed..0000000 --- a/inc/pulp_nnx.h +++ /dev/null @@ -1,75 +0,0 @@ -/* - * Luka Macan - * - * Copyright 2023 ETH Zurich and University of Bologna - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- * - * SPDX-License-Identifier: Apache-2.0 - */ - -#ifndef __PULP_NNX_H__ -#define __PULP_NNX_H__ - -#include - -typedef struct nnx_task_t nnx_task_t; -typedef struct nnx_norm_t nnx_norm_t; -typedef struct nnx_quant_t nnx_quant_t; -typedef enum nnx_weight_offset_mode_e nnx_weight_offset_mode_e; - -void nnx_init(uint32_t max_stall); -void nnx_term(); -int nnx_dispatch_check(); -void nnx_dispatch_check_blocking(); -void nnx_dispatch_task(nnx_task_t *task); -int nnx_resolve_check(nnx_task_t *task); -void nnx_resolve_check_blocking(nnx_task_t *task); - -void nnx_task_init(nnx_task_t *task, const uint8_t kernel_shape, - const uint8_t depthwise, const uint8_t input_bits, - const uint8_t output_bits, const uint8_t weights_bits, - nnx_weight_offset_mode_e weights_offset_mode, - const uint32_t weights_offset_factor, nnx_quant_t quant, - nnx_norm_t norm, const uint8_t stride); -uint32_t nnx_pad_ptr(uint32_t ptr, const uint32_t width, const uint32_t channel, - const uint8_t bits, const uint8_t padding_top, - const uint8_t padding_left); -void nnx_task_set_ptrs(nnx_task_t *task, uint32_t input_ptr, uint32_t w_in, - uint32_t k_in, uint8_t bits_in, uint8_t padding_top, - uint8_t padding_left, uint32_t output_ptr, - uint32_t weights_ptr, uint32_t scale_ptr, - uint32_t shift_ptr, uint32_t bias_ptr); -void nnx_task_set_dims(nnx_task_t *task, const uint32_t w_in, - const uint32_t k_in, const uint32_t w_in_stride, - const uint32_t k_in_stride, const uint32_t h_out, - const uint32_t w_out, const uint32_t k_out, - const uint32_t w_out_stride, const uint32_t k_out_stride, - const uint8_t padding_top, const uint8_t padding_bottom, - const uint8_t padding_right, const uint8_t padding_left); -void nnx_task_set_dims_stride2x2( - nnx_task_t *task, const uint32_t h_in, const uint32_t w_in, - const uint32_t k_in, const uint32_t w_in_stride, const uint32_t k_in_stride, - const uint32_t h_out, const uint32_t w_out, const uint32_t k_out, - const uint32_t w_out_stride, const uint32_t 
k_out_stride, - const uint8_t h_ker, const uint8_t w_ker, const uint8_t padding_top, - const uint8_t padding_bottom, const uint8_t padding_right, - const uint8_t padding_left); -void nnx_dispatch_task_stride2x2( - nnx_task_t *task, const uint32_t w_in, const uint32_t k_in, - const uint32_t w_in_stride, const uint32_t k_in_stride, - const uint32_t h_out, const uint32_t w_out, const uint32_t k_out, - const uint32_t w_out_stride, const uint32_t k_out_stride, - const uint8_t h_ker, const uint8_t w_ker); - -#endif // __PULP_NNX_H__ diff --git a/inc/pulp_nnx_ne16.h b/inc/pulp_nnx_ne16.h new file mode 100644 index 0000000..eff9a60 --- /dev/null +++ b/inc/pulp_nnx_ne16.h @@ -0,0 +1,77 @@ +/* + * Luka Macan + * + * Copyright 2023 ETH Zurich and University of Bologna + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + * + * SPDX-License-Identifier: Apache-2.0 + */ + +#include "ne16.h" +#include "ne16_pulp_bsp.h" +#include "ne16_task.h" +#include + +/* PULP-NNX interface */ + +void ne16_nnx_init(ne16_dev_t *dev, ne16_pulp_conf_t *conf); +void ne16_nnx_term(ne16_dev_t *dev); + +/** ne16_nnx_dispatch_check + * + * Check whether you can dispatch to the accelerator. + */ +int ne16_nnx_dispatch_check(ne16_dev_t *dev); + +/** ne16_nnx_dispatch_wait + * + * Block until you can dispatch to the accelerator. + */ +void ne16_nnx_dispatch_wait(ne16_dev_t *dev); + +/** ne16_nnx_dispatch + * + * Dispatch a task to the accelerator. 
+ * Fails with return code 1 if the task cannot be dispatched. Otherwise returns 0. + */ +int ne16_nnx_dispatch(ne16_dev_t *dev, ne16_task_t *task); + +/** ne16_nnx_resolve_check + * + * Check whether the task has been resolved. + */ +int ne16_nnx_resolve_check(ne16_dev_t *dev, ne16_task_t *task); + +/** ne16_nnx_resolve_wait + * + * Block until you can resolve the task. + */ +void ne16_nnx_resolve_wait(ne16_dev_t *dev, ne16_task_t *task); + + +/* Additional helper functions */ + +/** ne16_nnx_dispatch_stride2x2 + * + * It uses NE16's 2x2 strided mode which reduces the number of writes NE16 does. + * This mode doesn't stride the NE16's subtile input pointer, so we have to + * tile the tile to the subtile's spatial dimensions (in this case 3x3 output). + * Works only if the k_out is divisible by 2. + */ +void ne16_nnx_dispatch_stride2x2( + ne16_dev_t *dev, ne16_task_t *task, const uint32_t w_in, const uint32_t k_in, + const uint32_t w_in_stride, const uint32_t k_in_stride, + const uint32_t h_out, const uint32_t w_out, const uint32_t k_out, + const uint32_t w_out_stride, const uint32_t k_out_stride, + const uint8_t h_ker, const uint8_t w_ker); diff --git a/ne16/bsp/ne16_pulp_bsp.c b/ne16/bsp/ne16_pulp_bsp.c new file mode 100644 index 0000000..a170720 --- /dev/null +++ b/ne16/bsp/ne16_pulp_bsp.c @@ -0,0 +1,85 @@ +/* + * Luka Macan + * + * Copyright 2023 ETH Zurich and University of Bologna + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ * + * SPDX-License-Identifier: Apache-2.0 + */ + +#include "ne16_pulp_bsp.h" +#include + +#define NE16_PULP_CLUSTER_CTRL_ADDR_BASE (0x00200000) +#define NE16_PULP_CLUSTER_CTRL_HWPE_OFFS 0x18 +#define NE16_PULP_CLUSTER_CTRL_HWPE_ADDR \ + (NE16_PULP_CLUSTER_CTRL_ADDR_BASE + NE16_PULP_CLUSTER_CTRL_HWPE_OFFS) +#define NE16_PULP_CLUSTER_CTRL_HWPE_MASK_CG_EN 0x800 +#define NE16_PULP_CLUSTER_CTRL_HWPE_MASK_HCI_PRIO 0x100 +#define NE16_PULP_CLUSTER_CTRL_HWPE_MASK_HCI_MAXSTALL 0xff +#define NE16_PULP_MAX_STALL (8) +#define NE16_PULP_EVENT (1 << 12) +#define NE16_PULP_BASE_ADDR (0x00201000) + +void ne16_pulp_cg_enable() { + *(volatile uint32_t *)NE16_PULP_CLUSTER_CTRL_HWPE_ADDR |= + NE16_PULP_CLUSTER_CTRL_HWPE_MASK_CG_EN; +} + +void ne16_pulp_cg_disable() { + *(volatile uint32_t *)NE16_PULP_CLUSTER_CTRL_HWPE_ADDR &= + ~NE16_PULP_CLUSTER_CTRL_HWPE_MASK_CG_EN; +} + +void ne16_pulp_hci_setpriority_ne16() { + *(volatile uint32_t *)NE16_PULP_CLUSTER_CTRL_HWPE_ADDR |= + NE16_PULP_CLUSTER_CTRL_HWPE_MASK_HCI_PRIO; +} + +void ne16_pulp_hci_setpriority_core() { + *(volatile uint32_t *)NE16_PULP_CLUSTER_CTRL_HWPE_ADDR &= + ~NE16_PULP_CLUSTER_CTRL_HWPE_MASK_HCI_PRIO; +} + +void ne16_pulp_hci_reset_max_stall() { + *(volatile uint32_t *)NE16_PULP_CLUSTER_CTRL_HWPE_ADDR &= + ~NE16_PULP_CLUSTER_CTRL_HWPE_MASK_HCI_MAXSTALL; +} + +void ne16_pulp_hci_set_max_stall(uint32_t max_stall) { + *(volatile uint32_t *)NE16_PULP_CLUSTER_CTRL_HWPE_ADDR |= + max_stall & NE16_PULP_CLUSTER_CTRL_HWPE_MASK_HCI_MAXSTALL; +} + +void ne16_pulp_open(ne16_pulp_conf_t *conf) { + ne16_pulp_cg_enable(); + ne16_pulp_hci_setpriority_ne16(); + ne16_pulp_hci_set_max_stall(conf->max_stall); +} + +void ne16_pulp_close() { + ne16_pulp_hci_reset_max_stall(); + ne16_pulp_hci_setpriority_core(); + ne16_pulp_cg_disable(); +} + +void ne16_pulp_event_wait_and_clear() { + eu_evt_maskWaitAndClr(NE16_PULP_EVENT); +} + +static const ne16_dev_t ne16_pulp_dev = { + .hwpe_dev = (struct hwpe_dev_t){ + .base_addr = (volatile uint32_t 
*)NE16_PULP_BASE_ADDR}}; + +const ne16_dev_t *ne16_pulp_get_dev() { return &ne16_pulp_dev; } diff --git a/ne16/bsp/ne16_pulp_bsp.h b/ne16/bsp/ne16_pulp_bsp.h new file mode 100644 index 0000000..8f1bc0a --- /dev/null +++ b/ne16/bsp/ne16_pulp_bsp.h @@ -0,0 +1,81 @@ +/* + * Luka Macan + * + * Copyright 2023 ETH Zurich and University of Bologna + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + * + * SPDX-License-Identifier: Apache-2.0 + */ + +#ifndef __NE16_PULP_BSP_H__ +#define __NE16_PULP_BSP_H__ + +#include "ne16.h" +#include + +/** + * ne16_pulp_cg_enable + * + * Enable clock gating of the NE16. + */ +void ne16_pulp_cg_enable(); + +/** + * ne16_pulp_cg_disable + * + * Disable clock gating of the NE16. + */ +void ne16_pulp_cg_disable(); + +/** + * ne16_pulp_hci_setpriority_ne16 + * + * Set HCI interconnect bus priority to prioritize NE16. + */ +void ne16_pulp_hci_setpriority_ne16(); + +/** + * ne16_pulp_hci_setpriority_core + * + * Set HCI bus priority to prioritize cores. + */ +void ne16_pulp_hci_setpriority_core(); + +/** + * ne16_pulp_hci_reset_max_stall + * + * Reset the HCI bus maxstall parameter. + * TODO: Check if it disables it also or just resets? + */ +void ne16_pulp_hci_reset_max_stall(); + +/** + * ne16_pulp_hci_set_max_stall + * + * Set the HCI bus maxstall. Maxstall defines how many cycles + * the HCI bus will stall the lower priority master, i.e. ne16 or core, + * before letting it do a transaction. 
+ */ +void ne16_pulp_hci_set_max_stall(uint32_t max_stall); + +typedef struct ne16_pulp_conf_t { + int max_stall; +} ne16_pulp_conf_t; + +void ne16_pulp_open(ne16_pulp_conf_t *conf); +void ne16_pulp_close(); +void ne16_pulp_event_wait_and_clear(); +const ne16_dev_t *ne16_pulp_get_dev(); + +#endif // !__NE16_PULP_BSP_H__ diff --git a/ne16/gvsoc/ne16_gvsoc_logging.h b/ne16/gvsoc/ne16_gvsoc.h similarity index 51% rename from ne16/gvsoc/ne16_gvsoc_logging.h rename to ne16/gvsoc/ne16_gvsoc.h index 19db8b5..f6626fd 100644 --- a/ne16/gvsoc/ne16_gvsoc_logging.h +++ b/ne16/gvsoc/ne16_gvsoc.h @@ -18,15 +18,19 @@ * SPDX-License-Identifier: Apache-2.0 */ -#ifndef __NE16_GVSOC_LOGGING_H__ -#define __NE16_GVSOC_LOGGING_H__ +#ifndef __NE16_GVSOC_H__ +#define __NE16_GVSOC_H__ -#include "ne16_hal.h" +#include "ne16.h" +#include "ne16_task.h" -typedef enum ne16_gvsoc_logging_format_e { - NE16_GVSOC_LOGGING_FORMAT_DECIMAL = 0, - NE16_GVSOC_LOGGING_FORMAT_HEXADECIMAL = 3 -} ne16_gvsoc_logging_format_e; +#define NE16_REG_GVSOC_LOG_LEVEL 24 +#define NE16_REG_GVSOC_LOG_FORMAT 25 + +typedef enum ne16_gvsoc_log_format_e { + NE16_GVSOC_LOG_FORMAT_DECIMAL = 0, + NE16_GVSOC_LOG_FORMAT_HEXADECIMAL = 3 +} ne16_gvsoc_log_format_e; typedef enum ne16_gvsoc_log_level_e { NE16_GVSOC_LOG_LEVEL_CONFIG = 0, @@ -35,15 +39,16 @@ typedef enum ne16_gvsoc_log_level_e { NE16_GVSOC_LOG_LEVEL_ALL = 3 } ne16_gvsoc_log_level_e; -static inline void -ne16_activate_gvsoc_logging(ne16_gvsoc_log_level_e log_level, - ne16_gvsoc_logging_format_e format) { - NE16_WRITE_IO_REG(sizeof(nnx_task_data_t), log_level); - NE16_WRITE_IO_REG(sizeof(nnx_task_data_t) + 4, format); +static void ne16_gvsoc_log_activate(ne16_dev_t *dev, + ne16_gvsoc_log_level_e log_level, + ne16_gvsoc_log_format_e format) { + hwpe_task_reg_write(&dev->hwpe_dev, NE16_REG_GVSOC_LOG_LEVEL, log_level); + hwpe_task_reg_write(&dev->hwpe_dev, NE16_REG_GVSOC_LOG_FORMAT, format); } -static inline void ne16_deactivate_gvsoc_logging() { - 
NE16_WRITE_IO_REG(sizeof(nnx_task_data_t), 0); +static void ne16_gvsoc_log_deactivate(ne16_dev_t *dev) { + hwpe_task_reg_write(&dev->hwpe_dev, NE16_REG_GVSOC_LOG_LEVEL, + NE16_GVSOC_LOG_LEVEL_CONFIG); } -#endif // __NE16_GVSOC_LOGGING_H__ +#endif // __NE16_GVSOC_H__ diff --git a/ne16/hal/ne16.c b/ne16/hal/ne16.c new file mode 100644 index 0000000..97859b4 --- /dev/null +++ b/ne16/hal/ne16.c @@ -0,0 +1,39 @@ +/* + * Luka Macan + * + * Copyright 2023 ETH Zurich and University of Bologna + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ * + * SPDX-License-Identifier: Apache-2.0 + */ + +#include "ne16.h" + +#define NE16_STATUS_EMPTY (0x000) +#define NE16_STATUS_FULL (0x101) + +inline int ne16_task_queue_size(ne16_dev_t *dev) { return 2; } + +inline int ne16_task_queue_tasks_in_flight(ne16_dev_t *dev) { + uint32_t status = hwpe_task_queue_status(&dev->hwpe_dev); + return (status & 0x1) + ((status >> 8) & 0x1); +} + +inline int ne16_task_queue_empty(ne16_dev_t *dev) { + return hwpe_task_queue_status(&dev->hwpe_dev) == NE16_STATUS_EMPTY; +} + +inline int ne16_task_queue_full(ne16_dev_t *dev) { + return hwpe_task_queue_status(&dev->hwpe_dev) == NE16_STATUS_FULL; +} diff --git a/ne16/hal/ne16.h b/ne16/hal/ne16.h new file mode 100644 index 0000000..c4c3a19 --- /dev/null +++ b/ne16/hal/ne16.h @@ -0,0 +1,36 @@ +/* + * Luka Macan + * + * Copyright 2023 ETH Zurich and University of Bologna + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ * + * SPDX-License-Identifier: Apache-2.0 + */ + +#ifndef __NE16_H__ +#define __NE16_H__ + +#include "hwpe.h" +#include + +typedef struct ne16_dev_t { + hwpe_dev_t hwpe_dev; /* Implements the HWPE device interface */ +} ne16_dev_t; + +int ne16_task_queue_size(ne16_dev_t *dev); +int ne16_task_queue_tasks_in_flight(ne16_dev_t *dev); +int ne16_task_queue_empty(ne16_dev_t *dev); +int ne16_task_queue_full(ne16_dev_t *dev); + +#endif // __NE16_H__ diff --git a/ne16/hal/ne16_defs.h b/ne16/hal/ne16_defs.h deleted file mode 100644 index 7aeb993..0000000 --- a/ne16/hal/ne16_defs.h +++ /dev/null @@ -1,157 +0,0 @@ -/* - * Luka Macan - * - * Copyright 2023 ETH Zurich and University of Bologna - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- * - * SPDX-License-Identifier: Apache-2.0 - */ - -#ifndef __NE16_DEFS_H__ -#define __NE16_DEFS_H__ - -/* ARHITECTURE */ - -#define NE16_FILTER_SIZE (3) -#define NE16_FILTER_BUFFER_SIZE (5) -#define NE16_INPUT_CHANNEL_THROUGHPUT (16) -#define NE16_OUTPUT_CHANNEL_THROUGHPUT (32) -#define NE16_CONTEXT_SIZE (2) - -#define NE16_WEIGHT_D0_STRIDE_MODE8 (2) -#define NE16_WEIGHT_D0_STRIDE_MODE16 (1) - -/* REGISTER MAP */ - -#define NE16_EVT0 (1 << 12) -#define NE16_EVT1 (1 << 13) -#define NE16_BASE_ADDR (0x00201000) - -/* CLUSTER */ - -#define CLUSTER_CTRL_ADDR_BASE (0x00200000) - -/* CLUSTER_HWPE */ - -#define CLUSTER_CTRL_HWPE_OFFS 0x18 - -#define CLUSTER_CTRL_HWPE_ADDR (CLUSTER_CTRL_ADDR_BASE + CLUSTER_CTRL_HWPE_OFFS) - -#define CLUSTER_CTRL_HWPE_MASK_CG_EN 0x800 -#define CLUSTER_CTRL_HWPE_MASK_HCI_PRIO 0x100 -#define CLUSTER_CTRL_HWPE_MASK_HCI_MAXSTALL 0xff - -/* REGISTER OFFSETS */ - -// commands -#define NE16_TRIGGER 0x00 -#define NE16_ACQUIRE 0x04 -#define NE16_FINISHED 0x08 -#define NE16_STATUS 0x0C -#define NE16_RUNNING_JOB 0x10 -#define NE16_SOFT_CLEAR 0x14 -#define NE16_SWSYNC 0x18 -#define NE16_URISCY_IMEM 0x1C - -// job configuration -#define NE16_REGISTER_OFFSET 0x20 - -#define NE16_REG_WEIGHTS_PTR 0x00 -#define NE16_REG_INFEAT_PTR 0x04 -#define NE16_REG_OUTFEAT_PTR 0x08 -#define NE16_REG_SCALE_PTR 0x0C -#define NE16_REG_SCALE_SHIFT_PTR 0x10 -#define NE16_REG_SCALE_BIAS_PTR 0x14 -#define NE16_REG_INFEAT_D0_STRIDE 0x18 -#define NE16_REG_INFEAT_D1_STRIDE 0x1C -#define NE16_REG_INFEAT_D2_STRIDE 0x20 -#define NE16_REG_OUTFEAT_D0_STRIDE 0x24 -#define NE16_REG_OUTFEAT_D1_STRIDE 0x28 -#define NE16_REG_OUTFEAT_D2_STRIDE 0x2C -#define NE16_REG_WEIGHTS_D0_STRIDE 0x30 -#define NE16_REG_WEIGHTS_D1_STRIDE 0x34 -#define NE16_REG_WEIGHTS_D2_STRIDE 0x38 -#define NE16_REG_SUBTILE_REMAINDER_0 0x3C -#define NE16_REG_SUBTILE_REMAINDER_1 0x40 -#define NE16_REG_SUBTILE_REMAINDER_2 0x44 -#define NE16_REG_SUBTILE_NUMBER_0 0x48 -#define NE16_REG_SUBTILE_NUMBER_1 0x4C -#define 
NE16_REG_PADDING 0x50 -#define NE16_REG_WEIGHT_OFFSET_FACTOR 0x54 -#define NE16_REG_FILTER_MASKING 0x58 -#define NE16_REG_CONF0 0x5C - -/* SHIFT */ - -#define NE16_SHIFT_FLAG_NORM_BIAS (25) -#define NE16_SHIFT_FLAG_NORM_SHIFT (24) -#define NE16_SHIFT_ROUNDING (11) - -/* CONF0 FLAGS */ - -#define NE16_FLAG_NORM_BIAS (1 << 25) -#define NE16_FLAG_NORM_SHIFT (1 << 24) -#define NE16_FLAG_QUANT_FUNCTION_IDENTITY (1 << 23) -#define NE16_FLAG_QUANT_FUNCTION_RELU (0 << 23) -#define NE16_QUANT_MODE_8BIT (0 << 21) -#define NE16_QUANT_MODE_16BIT (1 << 21) -#define NE16_QUANT_MODE_32BIT (2 << 21) -// conf0[20:16] - quantization shift amount -#define NE16_FLAG_WEIGHT_OFFSET_SYMMETRIC (0 << 15) -#define NE16_FLAG_WEIGHT_OFFSET_LAYER_WISE (1 << 15) -#define NE16_FLAG_STREAMIN (1 << 14) -#define NE16_NORM_MODE_8BIT (0 << 12) -#define NE16_NORM_MODE_16BIT (1 << 12) -#define NE16_NORM_MODE_32BIT (2 << 12) -#define NE16_FLAG_ROUND (1 << 11) -#define NE16_FLAG_STRIDE_2x2 (1 << 8) -#define NE16_FLAG_LINEAR_MODE (1 << 7) -#define NE16_FLAG_MODE_3x3 (0 << 5) -#define NE16_FLAG_MODE_3x3_DW (1 << 5) -#define NE16_FLAG_MODE_1x1 (2 << 5) -#define NE16_FLAG_NORM_QUANT (1 << 4) -#define NE16_FLAG_MODE_BASIC (0 << 3) -#define NE16_FLAG_MODE16 (1 << 3) - -/* Masks */ - -#define NE16_MASK_QUANT_FUNCTION (1 << 23) -#define NE16_MASK_QUANT_MODE (3 << 21) - -/* PADDING */ - -#define NE16_DONT_PAD (0) -#define NE16_MAX_PAD (2) - -/* NORM */ -#define NE16_NORM_MAX_LEN (32) -#define NE16_NO_NORM(length) \ - { \ - .scale = scale_identity, .bias = NE16_NULL, .shift = NE16_NULL, \ - .length = length, .mode = normMode32Bit \ - } - -/* QUANT */ -#define NE16_NO_QUANT \ - { \ - .shift_amount = 0, .mode = quantMode32Bit, \ - .function = quantFunctionIdentity \ - } - -/* NULL */ -#define NE16_NULL ((void *)0) - -#define NE16_STATUS_FULL (0x101) - -#endif // __NE16_DEFS_H__ diff --git a/ne16/hal/ne16_hal.h b/ne16/hal/ne16_hal.h deleted file mode 100644 index 1bc460f..0000000 --- a/ne16/hal/ne16_hal.h +++ 
/dev/null @@ -1,197 +0,0 @@ -/* - * Luka Macan - * - * Copyright 2023 ETH Zurich and University of Bologna - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - * - * SPDX-License-Identifier: Apache-2.0 - */ - -#ifndef __NE16_HAL_H__ -#define __NE16_HAL_H__ - -#include "ne16_defs.h" -#include - -#define NE16_WRITE(offset, value) \ - *(int volatile *)(NE16_BASE_ADDR + (offset)) = (value) -#define NE16_READ(offset) *(int volatile *)(NE16_BASE_ADDR + (offset)) - -#define NE16_WRITE_IO_REG(offset, value) \ - NE16_WRITE(NE16_REGISTER_OFFSET + (offset), (value)) -#define NE16_READ_IO_REG(offset) NE16_READ(NE16_REGISTER_OFFSET + (offset)) - -#define NE16_FLAG_USED (1) -#define NE16_FLAG_UNUSED (0) - -typedef enum nnx_weight_offset_mode_e { - weightOffsetModeSymmetric = NE16_FLAG_WEIGHT_OFFSET_SYMMETRIC, - weightOffsetModeLayerWise = NE16_FLAG_WEIGHT_OFFSET_LAYER_WISE -} nnx_weight_offset_mode_e; - -typedef enum { - normMode8Bit = NE16_NORM_MODE_8BIT, - normMode16Bit = NE16_NORM_MODE_16BIT, - normMode32Bit = NE16_NORM_MODE_32BIT -} nnx_norm_mode_e; - -typedef struct nnx_norm_t { - nnx_norm_mode_e mode; - int flag_bias; - int flag_shift; -} nnx_norm_t; - -typedef enum nnx_quant_mode_e { - quantMode8Bit = NE16_QUANT_MODE_8BIT, - quantMode16Bit = NE16_QUANT_MODE_16BIT, - quantMode32Bit = NE16_QUANT_MODE_32BIT -} nnx_quant_mode_e; - -typedef enum nnx_quant_function_e { - quantFunctionIdentity = NE16_FLAG_QUANT_FUNCTION_IDENTITY, - quantFunctionRelu = 
NE16_FLAG_QUANT_FUNCTION_RELU -} nnx_quant_function_e; - -typedef struct nnx_quant_t { - // Shift amount must be in range 0x00-0x1F - unsigned shift_amount; - nnx_quant_mode_e mode; - nnx_quant_function_e function; - int flag_rounding; -} nnx_quant_t; - -typedef struct nnx_stride_t { - uint32_t d0; - uint32_t d1; - uint32_t d2; -} nnx_stride_t; - -typedef struct nnx_subtile_remainder_t { - uint32_t KoKi; - uint32_t HoWo; - uint32_t HiWi; -} nnx_subtile_remainder_t; - -typedef struct nnx_subtile_number_t { - uint32_t KoKi; - uint32_t HoWo; -} nnx_subtile_number_t; - -typedef struct nnx_subtile_t { - nnx_subtile_remainder_t remainder; - nnx_subtile_number_t number; -} nnx_subtile_t; - -typedef struct nnx_cfg_t { - nnx_stride_t input_stride; - nnx_stride_t output_stride; - nnx_stride_t weights_stride; - nnx_subtile_t subtile; - uint32_t padding; - uint32_t weight_offset_factor; - uint32_t filter_mask; - uint32_t conf0; -} nnx_cfg_t; - -typedef struct nnx_task_data_t { - uint32_t weights_ptr; - uint32_t infeat_ptr; - uint32_t outfeat_ptr; - uint32_t scale_ptr; - uint32_t scale_shift_ptr; - uint32_t scale_bias_ptr; - nnx_cfg_t cfg; -} nnx_task_data_t; - -typedef struct nnx_task_t { - nnx_task_data_t data; - uint8_t outbytes; - uint8_t weight_d0_stride; - uint8_t qw; - uint8_t stride_shift; - uint8_t output_channel_throughput; - uint8_t kernel_shape; - uint8_t depthwise; - uint8_t id; -} nnx_task_t; - -void ne16_cg_enable(); -void ne16_cg_disable(); - -/** - * ne16_setpriority_ne16 - * - * Set HCI interconnect bus priority to prioritize NE16. - */ -void ne16_setpriority_ne16(); - -/** - * ne16_setpriority_core - * - * Set HCI bus priority to prioritize cores. - */ -void ne16_setpriority_core(); - -/** - * ne16_reset_maxstall - * - * Reset the HCI bus maxstall parameter. - * TODO: Check if it disables it also or just resets? - */ -void ne16_reset_max_stall(); - -/** - * ne16_set_maxstall - * - * Set the HCI bus maxstall. 
Maxstall defines how many cycles - * will the HCI bus stall the lower priority master, i.e. ne16 or core, - * before letting it do a transaction. - */ -void ne16_set_max_stall(uint32_t max_stall); -void ne16_soft_clear(); -int ne16_empty(); -int ne16_full(); -uint8_t ne16_last_task_id(); -void ne16_event_wait(); -uint8_t ne16_acquire(); -void ne16_run_async(); -void ne16_commit(); -uint32_t ne16_get_tile_padding(uint32_t padding, uint32_t i_height, - uint32_t i_width, uint32_t n_height, - uint32_t n_width); - -void ne16_task_init(nnx_task_t *task, const uint8_t kernel_shape, - const uint8_t depthwise, const uint8_t input_bits, - const uint8_t output_bits, const uint8_t weights_bits, - const nnx_weight_offset_mode_e weights_offset_mode, - const uint32_t weights_offset_factor, nnx_quant_t quant, - nnx_norm_t norm, const uint8_t stride); -void ne16_task_set_strides(nnx_task_t *task, const uint32_t k_in, - const uint32_t w_in_stride, - const uint32_t k_in_stride, - const uint32_t w_out_stride, - const uint32_t k_out_stride); -void ne16_task_set_counters(nnx_task_t *task, const uint32_t k_in, - const uint32_t h_out, const uint32_t w_out, - const uint32_t k_out, const uint8_t padding_bottom, - const uint8_t padding_right); -void ne16_task_set_padding(nnx_task_t *task, const uint8_t top, - const uint8_t bottom, const uint8_t left, - const uint8_t right, const uint8_t value); -void ne16_task_set_mask_filter(nnx_task_t *task, const uint8_t top, - const uint8_t right, const uint8_t bottom, - const uint8_t left); -void ne16_task_offload(nnx_task_t *task); - -#endif // __NE16_HAL_H__ diff --git a/ne16/hal/ne16_hal.c b/ne16/hal/ne16_task.c similarity index 61% rename from ne16/hal/ne16_hal.c rename to ne16/hal/ne16_task.c index 42b076c..0ba54d5 100644 --- a/ne16/hal/ne16_hal.c +++ b/ne16/hal/ne16_task.c @@ -18,63 +18,10 @@ * SPDX-License-Identifier: Apache-2.0 */ -#include - -#include "ne16_defs.h" -#include "ne16_hal.h" -#include "pmsis.h" +#include "ne16_task.h" +#include 
"ne16_task_defs.h" #include "pulp_nnx_util.h" -inline void ne16_cg_enable() { - *(volatile uint32_t *)CLUSTER_CTRL_HWPE_ADDR |= CLUSTER_CTRL_HWPE_MASK_CG_EN; -} - -inline void ne16_cg_disable() { - *(volatile uint32_t *)CLUSTER_CTRL_HWPE_ADDR &= ~CLUSTER_CTRL_HWPE_MASK_CG_EN; -} - -inline void ne16_setpriority_ne16() { - *(volatile uint32_t *)CLUSTER_CTRL_HWPE_ADDR |= - CLUSTER_CTRL_HWPE_MASK_HCI_PRIO; -} - -inline void ne16_setpriority_core() { - *(volatile uint32_t *)CLUSTER_CTRL_HWPE_ADDR &= - ~CLUSTER_CTRL_HWPE_MASK_HCI_PRIO; -} - -inline void ne16_reset_max_stall() { - *(volatile uint32_t *)CLUSTER_CTRL_HWPE_ADDR &= - ~CLUSTER_CTRL_HWPE_MASK_HCI_MAXSTALL; -} - -inline void ne16_set_max_stall(uint32_t max_stall) { - *(volatile uint32_t *)CLUSTER_CTRL_HWPE_ADDR |= - max_stall & CLUSTER_CTRL_HWPE_MASK_HCI_MAXSTALL; -} - -inline void ne16_soft_clear() { - NE16_WRITE(NE16_SOFT_CLEAR, 0); - for (volatile int i = 0; i < 10; i++) - ; -} - -inline int ne16_empty() { return NE16_READ(NE16_STATUS) == 0; } - -inline int ne16_full() { return NE16_READ(NE16_STATUS) == NE16_STATUS_FULL; } - -inline uint8_t ne16_last_task_id() { return NE16_READ(NE16_RUNNING_JOB); } - -inline void ne16_event_wait() { eu_evt_maskWaitAndClr(NE16_EVT0); } - -inline uint8_t ne16_acquire() { return NE16_READ(NE16_ACQUIRE); } - -inline void ne16_run_async() { NE16_WRITE(NE16_TRIGGER, 0); } - -inline void ne16_commit() { - NE16_WRITE(NE16_TRIGGER, 1); // commit, no trigger -} - inline uint32_t ne16_get_tile_padding(uint32_t padding, uint32_t i_height, uint32_t i_width, uint32_t n_height, uint32_t n_width) { @@ -94,16 +41,16 @@ inline uint32_t ne16_get_tile_padding(uint32_t padding, uint32_t i_height, return tile_padding; } -void ne16_task_init(nnx_task_t *task, const uint8_t kernel_shape, +void ne16_task_init(ne16_task_t *task, const uint8_t kernel_shape, const uint8_t depthwise, const uint8_t input_bits, const uint8_t output_bits, const uint8_t weights_bits, - const nnx_weight_offset_mode_e 
weights_offset_mode, - const uint32_t weights_offset_factor, nnx_quant_t quant, - nnx_norm_t norm, const uint8_t stride) { + const ne16_weight_offset_mode_e weights_offset_mode, + const uint32_t weights_offset_factor, ne16_quant_t quant, + ne16_norm_t norm, const uint8_t stride) { const uint32_t flag_mode16 = input_bits == 16 ? NE16_FLAG_MODE16 : NE16_FLAG_MODE_BASIC; - *task = (nnx_task_t){ + *task = (ne16_task_t){ .outbytes = output_bits / 8, .weight_d0_stride = flag_mode16 ? NE16_WEIGHT_D0_STRIDE_MODE16 : NE16_WEIGHT_D0_STRIDE_MODE8, @@ -131,14 +78,42 @@ void ne16_task_init(nnx_task_t *task, const uint8_t kernel_shape, task->data.cfg.weight_offset_factor = weights_offset_factor; } -void ne16_task_set_strides(nnx_task_t *task, const uint32_t k_in, +/** ne16_pad_ptr + * + * Move ptr back by padding_top rows and padding_left pixels so that + * it points to where the padded data would start. + * Necessary for input pointer when it's padded. + */ +inline uint32_t ne16_pad_ptr(uint32_t ptr, const uint32_t width, + const uint32_t channel, const uint8_t bits, + const uint8_t padding_top, + const uint8_t padding_left) { + return ptr - (padding_top * width + padding_left) * channel * bits / 8; +} + +inline void ne16_task_set_ptrs(ne16_task_t *task, uint32_t input_ptr, + uint32_t w_in, uint32_t k_in, uint8_t bits_in, + uint8_t padding_top, uint8_t padding_left, + uint32_t output_ptr, uint32_t weights_ptr, + uint32_t scale_ptr, uint32_t shift_ptr, + uint32_t bias_ptr) { + task->data.infeat_ptr = + ne16_pad_ptr(input_ptr, w_in, k_in, bits_in, padding_top, padding_left); + task->data.outfeat_ptr = output_ptr; + task->data.weights_ptr = weights_ptr; + task->data.scale_ptr = scale_ptr; + task->data.scale_shift_ptr = shift_ptr; + task->data.scale_bias_ptr = bias_ptr; +} + +void ne16_task_set_strides(ne16_task_t *task, const uint32_t k_in, const uint32_t w_in_stride, const uint32_t k_in_stride, const uint32_t w_out_stride, const uint32_t k_out_stride) { const uint32_t num_k_in = divnceil(k_in, 
NE16_INPUT_CHANNEL_THROUGHPUT); - const nnx_stride_t input_stride = { + const ne16_stride_t input_stride = { .d0 = k_in_stride, .d1 = k_in_stride * w_in_stride, .d2 = task->depthwise ? 0 @@ -147,7 +122,7 @@ void ne16_task_set_strides(nnx_task_t *task, const uint32_t k_in, task->data.cfg.input_stride = input_stride; // WARNING: Stride works only for even output channel sizes (divisible by 2) - const nnx_stride_t output_stride = { + const ne16_stride_t output_stride = { .d0 = 32, .d1 = (k_out_stride * task->outbytes) >> task->stride_shift, .d2 = @@ -174,7 +149,7 @@ void ne16_task_set_strides(nnx_task_t *task, const uint32_t k_in, } } -void ne16_task_set_counters(nnx_task_t *task, const uint32_t k_in, +void ne16_task_set_counters(ne16_task_t *task, const uint32_t k_in, const uint32_t h_out, const uint32_t w_out, const uint32_t k_out, const uint8_t padding_bottom, const uint8_t padding_right) { @@ -192,7 +167,7 @@ void ne16_task_set_counters(nnx_task_t *task, const uint32_t k_in, const uint16_t rem_Wi = (task->kernel_shape == 1 ? 
rem_Wo : rem_Wo + 2) - padding_right; - const nnx_subtile_t subtile = { + const ne16_subtile_t subtile = { .number = {.KoKi = concat_half(num_Ko, num_Ki), .HoWo = concat_half(num_Ho, num_Wo)}, .remainder = {.KoKi = concat_half(rem_Ko, rem_Ki), @@ -201,7 +176,7 @@ void ne16_task_set_counters(nnx_task_t *task, const uint32_t k_in, task->data.cfg.subtile = subtile; } -inline void ne16_task_set_padding(nnx_task_t *task, const uint8_t top, +inline void ne16_task_set_padding(ne16_task_t *task, const uint8_t top, const uint8_t bottom, const uint8_t left, const uint8_t right, const uint8_t value) { task->data.cfg.padding = ((top & 0xf) << 28) | ((right & 0xf) << 24) | @@ -209,16 +184,49 @@ inline void ne16_task_set_padding(nnx_task_t *task, const uint8_t top, (value & 0xff); } -inline void ne16_task_set_mask_filter(nnx_task_t *task, const uint8_t top, +inline void ne16_task_set_mask_filter(ne16_task_t *task, const uint8_t top, const uint8_t right, const uint8_t bottom, const uint8_t left) { task->data.cfg.filter_mask = ((top & 0xff) << 24) | ((right & 0xff) << 16) | ((bottom & 0xff) << 8) | ((left & 0xff) << 0); } -inline void ne16_task_offload(nnx_task_t *task) { - uint32_t *task_data = (uint32_t *)&task->data; - for (int i = 0; i < sizeof(nnx_task_data_t) / 4; ++i) { - NE16_WRITE_IO_REG(i * 4, task_data[i]); - } +void ne16_task_set_dims(ne16_task_t *task, const uint32_t w_in, + const uint32_t k_in, const uint32_t w_in_stride, + const uint32_t k_in_stride, const uint32_t h_out, + const uint32_t w_out, const uint32_t k_out, + const uint32_t w_out_stride, const uint32_t k_out_stride, + const uint8_t padding_top, const uint8_t padding_bottom, + const uint8_t padding_right, + const uint8_t padding_left) { + ne16_task_set_strides(task, k_in, w_in_stride, k_in_stride, w_out_stride, + k_out_stride); + ne16_task_set_counters(task, k_in, h_out, w_out, k_out, padding_bottom, + padding_right); + ne16_task_set_padding(task, padding_top, padding_bottom, padding_left, + padding_right, 
0); +} + +void ne16_task_set_dims_stride2x2( + ne16_task_t *task, const uint32_t h_in, const uint32_t w_in, + const uint32_t k_in, const uint32_t w_in_stride, const uint32_t k_in_stride, + const uint32_t h_out, const uint32_t w_out, const uint32_t k_out, + const uint32_t w_out_stride, const uint32_t k_out_stride, + const uint8_t h_ker, const uint8_t w_ker, const uint8_t padding_top, + const uint8_t padding_bottom, const uint8_t padding_right, + const uint8_t padding_left) { + const uint8_t stride = 2; + + ne16_task_set_strides(task, k_in, w_in_stride, k_in_stride, w_out_stride, + k_out_stride); + ne16_task_set_counters(task, k_in, h_out > 1 ? 3 : 1, w_out > 1 ? 3 : 1, + k_out, h_in + padding_top >= 5 ? 0 : padding_bottom, 0); + + const uint8_t padding_bottom_new = + (h_in + padding_top - h_ker) % stride == 0 ? 0 : padding_bottom; + const uint8_t padding_right_new = + (w_in + padding_left - w_ker) % stride == 0 ? 0 : padding_right; + + ne16_task_set_padding(task, padding_top, padding_bottom_new, padding_left, + padding_right_new, 0); } diff --git a/ne16/hal/ne16_task.h b/ne16/hal/ne16_task.h new file mode 100644 index 0000000..df16b6c --- /dev/null +++ b/ne16/hal/ne16_task.h @@ -0,0 +1,172 @@ +/* + * Luka Macan + * + * Copyright 2023 ETH Zurich and University of Bologna + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ * + * SPDX-License-Identifier: Apache-2.0 + */ + +#ifndef __NE16_TASK_H__ +#define __NE16_TASK_H__ + +#include "ne16_task_defs.h" +#include + +typedef enum ne16_task_flag_e { + ne16TaskFlagFalse = 0, + ne16TaskFlagTrue = 1 +} ne16_task_flag_e; + +typedef enum ne16_weight_offset_mode_e { + weightOffsetModeSymmetric = NE16_FLAG_WEIGHT_OFFSET_SYMMETRIC, + weightOffsetModeLayerWise = NE16_FLAG_WEIGHT_OFFSET_LAYER_WISE +} ne16_weight_offset_mode_e; + +typedef enum { + normMode8Bit = NE16_NORM_MODE_8BIT, + normMode16Bit = NE16_NORM_MODE_16BIT, + normMode32Bit = NE16_NORM_MODE_32BIT +} ne16_norm_mode_e; + +typedef struct ne16_norm_t { + ne16_norm_mode_e mode; + int flag_bias; + int flag_shift; +} ne16_norm_t; + +typedef enum ne16_quant_mode_e { + quantMode8Bit = NE16_QUANT_MODE_8BIT, + quantMode16Bit = NE16_QUANT_MODE_16BIT, + quantMode32Bit = NE16_QUANT_MODE_32BIT +} ne16_quant_mode_e; + +typedef enum ne16_quant_function_e { + quantFunctionIdentity = NE16_FLAG_QUANT_FUNCTION_IDENTITY, + quantFunctionRelu = NE16_FLAG_QUANT_FUNCTION_RELU +} ne16_quant_function_e; + +typedef struct ne16_quant_t { + // Shift amount must be in range 0x00-0x1F + unsigned shift_amount; + ne16_quant_mode_e mode; + ne16_quant_function_e function; + int flag_rounding; +} ne16_quant_t; + +typedef struct ne16_stride_t { + uint32_t d0; + uint32_t d1; + uint32_t d2; +} ne16_stride_t; + +typedef struct ne16_subtile_remainder_t { + uint32_t KoKi; + uint32_t HoWo; + uint32_t HiWi; +} ne16_subtile_remainder_t; + +typedef struct ne16_subtile_number_t { + uint32_t KoKi; + uint32_t HoWo; +} ne16_subtile_number_t; + +typedef struct ne16_subtile_t { + ne16_subtile_remainder_t remainder; + ne16_subtile_number_t number; +} ne16_subtile_t; + +typedef struct ne16_cfg_t { + ne16_stride_t input_stride; + ne16_stride_t output_stride; + ne16_stride_t weights_stride; + ne16_subtile_t subtile; + uint32_t padding; + uint32_t weight_offset_factor; + uint32_t filter_mask; + uint32_t conf0; +} ne16_cfg_t; + +typedef 
struct ne16_task_data_t { + uint32_t weights_ptr; + uint32_t infeat_ptr; + uint32_t outfeat_ptr; + uint32_t scale_ptr; + uint32_t scale_shift_ptr; + uint32_t scale_bias_ptr; + ne16_cfg_t cfg; +} ne16_task_data_t; + +typedef struct ne16_task_t { + ne16_task_data_t data; + uint8_t outbytes; + uint8_t weight_d0_stride; + uint8_t qw; + uint8_t stride_shift; + uint8_t output_channel_throughput; + uint8_t kernel_shape; + uint8_t depthwise; + uint8_t id; +} ne16_task_t; + +void ne16_task_init(ne16_task_t *task, const uint8_t kernel_shape, + const uint8_t depthwise, const uint8_t input_bits, + const uint8_t output_bits, const uint8_t weights_bits, + const ne16_weight_offset_mode_e weights_offset_mode, + const uint32_t weights_offset_factor, ne16_quant_t quant, + ne16_norm_t norm, const uint8_t stride); +uint32_t ne16_get_tile_padding(uint32_t padding, uint32_t i_height, + uint32_t i_width, uint32_t n_height, + uint32_t n_width); +uint32_t ne16_pad_ptr(uint32_t ptr, const uint32_t width, + const uint32_t channel, const uint8_t bits, + const uint8_t padding_top, const uint8_t padding_left); +void ne16_task_set_ptrs(ne16_task_t *task, uint32_t input_ptr, uint32_t w_in, + uint32_t k_in, uint8_t bits_in, uint8_t padding_top, + uint8_t padding_left, uint32_t output_ptr, + uint32_t weights_ptr, uint32_t scale_ptr, + uint32_t shift_ptr, uint32_t bias_ptr); +void ne16_task_set_strides(ne16_task_t *task, const uint32_t k_in, + const uint32_t w_in_stride, + const uint32_t k_in_stride, + const uint32_t w_out_stride, + const uint32_t k_out_stride); +void ne16_task_set_counters(ne16_task_t *task, const uint32_t k_in, + const uint32_t h_out, const uint32_t w_out, + const uint32_t k_out, const uint8_t padding_bottom, + const uint8_t padding_right); +void ne16_task_set_padding(ne16_task_t *task, const uint8_t top, + const uint8_t bottom, const uint8_t left, + const uint8_t right, const uint8_t value); +void ne16_task_set_mask_filter(ne16_task_t *task, const uint8_t top, + const uint8_t 
right, const uint8_t bottom, + const uint8_t left); +void ne16_task_set_dims(ne16_task_t *task, const uint32_t w_in, + const uint32_t k_in, const uint32_t w_in_stride, + const uint32_t k_in_stride, const uint32_t h_out, + const uint32_t w_out, const uint32_t k_out, + const uint32_t w_out_stride, const uint32_t k_out_stride, + const uint8_t padding_top, const uint8_t padding_bottom, + const uint8_t padding_right, + const uint8_t padding_left); +void ne16_task_set_dims_stride2x2( + ne16_task_t *task, const uint32_t h_in, const uint32_t w_in, + const uint32_t k_in, const uint32_t w_in_stride, const uint32_t k_in_stride, + const uint32_t h_out, const uint32_t w_out, const uint32_t k_out, + const uint32_t w_out_stride, const uint32_t k_out_stride, + const uint8_t h_ker, const uint8_t w_ker, const uint8_t padding_top, + const uint8_t padding_bottom, const uint8_t padding_right, + const uint8_t padding_left); + +#endif // !__NE16_TASK_H__ diff --git a/ne16/hal/ne16_task_defs.h b/ne16/hal/ne16_task_defs.h new file mode 100644 index 0000000..803e30e --- /dev/null +++ b/ne16/hal/ne16_task_defs.h @@ -0,0 +1,107 @@ +/* + * Luka Macan + * + * Copyright 2023 ETH Zurich and University of Bologna + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ * + * SPDX-License-Identifier: Apache-2.0 + */ + +#ifndef __NE16_DEFS_H__ +#define __NE16_DEFS_H__ + +/* ARHITECTURE */ + +#define NE16_FILTER_SIZE (3) +#define NE16_FILTER_BUFFER_SIZE (5) +#define NE16_INPUT_CHANNEL_THROUGHPUT (16) +#define NE16_OUTPUT_CHANNEL_THROUGHPUT (32) + +#define NE16_WEIGHT_D0_STRIDE_MODE8 (2) +#define NE16_WEIGHT_D0_STRIDE_MODE16 (1) + +/* TASK REGISTERS */ + +// job configuration +#define NE16_REG_WEIGHTS_PTR 0 +#define NE16_REG_INFEAT_PTR 1 +#define NE16_REG_OUTFEAT_PTR 2 +#define NE16_REG_SCALE_PTR 3 +#define NE16_REG_SCALE_SHIFT_PTR 4 +#define NE16_REG_SCALE_BIAS_PTR 5 +#define NE16_REG_INFEAT_D0_STRIDE 6 +#define NE16_REG_INFEAT_D1_STRIDE 7 +#define NE16_REG_INFEAT_D2_STRIDE 8 +#define NE16_REG_OUTFEAT_D0_STRIDE 9 +#define NE16_REG_OUTFEAT_D1_STRIDE 10 +#define NE16_REG_OUTFEAT_D2_STRIDE 11 +#define NE16_REG_WEIGHTS_D0_STRIDE 12 +#define NE16_REG_WEIGHTS_D1_STRIDE 13 +#define NE16_REG_WEIGHTS_D2_STRIDE 14 +#define NE16_REG_SUBTILE_REMAINDER_0 15 +#define NE16_REG_SUBTILE_REMAINDER_1 16 +#define NE16_REG_SUBTILE_REMAINDER_2 17 +#define NE16_REG_SUBTILE_NUMBER_0 18 +#define NE16_REG_SUBTILE_NUMBER_1 19 +#define NE16_REG_PADDING 20 +#define NE16_REG_WEIGHT_OFFSET_FACTOR 21 +#define NE16_REG_FILTER_MASKING 22 +#define NE16_REG_CONF0 23 + +/* SHIFT */ + +#define NE16_SHIFT_FLAG_NORM_BIAS (25) +#define NE16_SHIFT_FLAG_NORM_SHIFT (24) +#define NE16_SHIFT_ROUNDING (11) + +/* CONF0 FLAGS */ + +#define NE16_FLAG_NORM_BIAS (1 << 25) +#define NE16_FLAG_NORM_SHIFT (1 << 24) +#define NE16_FLAG_QUANT_FUNCTION_IDENTITY (1 << 23) +#define NE16_FLAG_QUANT_FUNCTION_RELU (0 << 23) +#define NE16_QUANT_MODE_8BIT (0 << 21) +#define NE16_QUANT_MODE_16BIT (1 << 21) +#define NE16_QUANT_MODE_32BIT (2 << 21) +// conf0[20:16] - quantization shift amount +#define NE16_FLAG_WEIGHT_OFFSET_SYMMETRIC (0 << 15) +#define NE16_FLAG_WEIGHT_OFFSET_LAYER_WISE (1 << 15) +#define NE16_FLAG_STREAMIN (1 << 14) +#define NE16_NORM_MODE_8BIT (0 << 12) +#define 
NE16_NORM_MODE_16BIT (1 << 12) +#define NE16_NORM_MODE_32BIT (2 << 12) +#define NE16_FLAG_ROUND (1 << 11) +#define NE16_FLAG_STRIDE_2x2 (1 << 8) +#define NE16_FLAG_LINEAR_MODE (1 << 7) +#define NE16_FLAG_MODE_3x3 (0 << 5) +#define NE16_FLAG_MODE_3x3_DW (1 << 5) +#define NE16_FLAG_MODE_1x1 (2 << 5) +#define NE16_FLAG_NORM_QUANT (1 << 4) +#define NE16_FLAG_MODE_BASIC (0 << 3) +#define NE16_FLAG_MODE16 (1 << 3) + +/* Masks */ + +#define NE16_MASK_QUANT_FUNCTION (1 << 23) +#define NE16_MASK_QUANT_MODE (3 << 21) + +/* PADDING */ + +#define NE16_DONT_PAD (0) +#define NE16_MAX_PAD (2) + +/* NORM */ +#define NE16_NORM_MAX_LEN (32) + +#endif // __NE16_DEFS_H__ diff --git a/src/pulp_nnx_ne16.c b/src/pulp_nnx_ne16.c index 80752e8..7ab0e99 100644 --- a/src/pulp_nnx_ne16.c +++ b/src/pulp_nnx_ne16.c @@ -18,155 +18,63 @@ * SPDX-License-Identifier: Apache-2.0 */ -#include "ne16_hal.h" -#include "pmsis.h" -#include "pulp_nnx.h" +#include "pulp_nnx_ne16.h" +#include "hwpe.h" +#include "ne16.h" #include "pulp_nnx_util.h" +#include #include +#include -inline void nnx_init(uint32_t max_stall) { - ne16_cg_enable(); - ne16_setpriority_ne16(); - ne16_set_max_stall(max_stall); - ne16_soft_clear(); +void ne16_nnx_init(ne16_dev_t *dev, ne16_pulp_conf_t *conf) { + ne16_pulp_open(conf); + hwpe_soft_clear(&dev->hwpe_dev); } -inline void nnx_term() { - ne16_soft_clear(); - ne16_setpriority_core(); - ne16_reset_max_stall(); - ne16_cg_disable(); +void ne16_nnx_term(ne16_dev_t *dev) { + hwpe_soft_clear(&dev->hwpe_dev); + ne16_pulp_close(); } -/** nnx_dispatch_check - * - * Check whether you can dispatch to the accelerator. - */ -inline int nnx_dispatch_check() { return !ne16_full(); } +int ne16_nnx_dispatch_check(ne16_dev_t *dev) { + return !ne16_task_queue_full(dev); +} -/** nnx_dispatch_check_blocking - * - * Block until you can dispatch to the accelerator. 
- */ -inline void nnx_dispatch_check_blocking() { - while (!nnx_dispatch_check()) { - ne16_event_wait(); +void ne16_nnx_dispatch_wait(ne16_dev_t *dev) { + while (!ne16_nnx_dispatch_check(dev)) { + ne16_pulp_event_wait_and_clear(); } } -/** nnx_dispatch_task - * - * Dispatch a task to the accelerator, assuming it - * was checked before. - */ -inline void nnx_dispatch_task(nnx_task_t *task) { - task->id = ne16_acquire(); - ne16_task_offload(task); - ne16_run_async(); +int ne16_nnx_dispatch(ne16_dev_t *dev, ne16_task_t *task) { + if (hwpe_task_queue_acquire_task(&dev->hwpe_dev, &task->id)) { + return 1; + } + hwpe_task_queue_write_task(&dev->hwpe_dev, (uint32_t *)&task->data, + (int)(sizeof(ne16_task_data_t) / 4)); + hwpe_task_queue_release_and_run(&dev->hwpe_dev); + return 0; } -/** nnx_resolve_check - * - * Check whether the task has been resolved. - */ -inline int nnx_resolve_check(nnx_task_t *task) { +int ne16_nnx_resolve_check(ne16_dev_t *dev, ne16_task_t *task) { +#if __PLATFORM__ == ARCHI_PLATFORM_GVSOC + // GVSOC model has a broken running_id so resolve_check + // conservativly looks if the task queue is empty. + return ne16_task_queue_empty(dev); +#else uint8_t prev_task_id = task->id - 1; - return !(ne16_last_task_id() == prev_task_id || - (ne16_last_task_id() == task->id && !ne16_empty())); + return !(hwpe_last_task_id(&dev->hwpe_dev) == prev_task_id || + (hwpe_last_task_id(&dev->hwpe_dev) == task->id && + !ne16_task_queue_empty(dev))); +#endif } -/** nnx_resolve_check_blocking - * - * Block until you can resolve the task. 
- */ -inline void nnx_resolve_check_blocking(nnx_task_t *task) { - while (!nnx_resolve_check(task)) { - ne16_event_wait(); +void ne16_nnx_resolve_wait(ne16_dev_t *dev, ne16_task_t *task) { + while (!ne16_nnx_resolve_check(dev, task)) { + ne16_pulp_event_wait_and_clear(); } } -inline void nnx_task_init(nnx_task_t *task, const uint8_t kernel_shape, - const uint8_t depthwise, const uint8_t input_bits, - const uint8_t output_bits, const uint8_t weights_bits, - nnx_weight_offset_mode_e weights_offset_mode, - const uint32_t weights_offset_factor, - nnx_quant_t quant, nnx_norm_t norm, - const uint8_t stride) { - - ne16_task_init(task, kernel_shape, depthwise, input_bits, output_bits, - weights_bits, weights_offset_mode, weights_offset_factor, - quant, norm, stride); -} - -/** nnx_pad_ptr - * - * Calculate the pointer to the start of the ptr as if - * it was the start to the padded data. - * Necessary for input pointer when it's padded. - */ -inline uint32_t nnx_pad_ptr(uint32_t ptr, const uint32_t width, - const uint32_t channel, const uint8_t bits, - const uint8_t padding_top, - const uint8_t padding_left) { - return ptr - (padding_top * width + padding_left) * channel * bits / 8; -} - -inline void nnx_task_set_ptrs(nnx_task_t *task, uint32_t input_ptr, - uint32_t w_in, uint32_t k_in, uint8_t bits_in, - uint8_t padding_top, uint8_t padding_left, - uint32_t output_ptr, uint32_t weights_ptr, - uint32_t scale_ptr, uint32_t shift_ptr, - uint32_t bias_ptr) { - task->data.infeat_ptr = - nnx_pad_ptr(input_ptr, w_in, k_in, bits_in, padding_top, padding_left); - task->data.outfeat_ptr = output_ptr; - task->data.weights_ptr = weights_ptr; - task->data.scale_ptr = scale_ptr; - task->data.scale_shift_ptr = shift_ptr; - task->data.scale_bias_ptr = bias_ptr; -} - -void nnx_task_set_dims(nnx_task_t *task, const uint32_t w_in, - const uint32_t k_in, const uint32_t w_in_stride, - const uint32_t k_in_stride, const uint32_t h_out, - const uint32_t w_out, const uint32_t k_out, - const 
uint32_t w_out_stride, const uint32_t k_out_stride, - const uint8_t padding_top, const uint8_t padding_bottom, - const uint8_t padding_right, - const uint8_t padding_left) { - ne16_task_set_strides(task, k_in, w_in_stride, k_in_stride, w_out_stride, - k_out_stride); - ne16_task_set_counters(task, k_in, h_out, w_out, k_out, padding_bottom, - padding_right); - ne16_task_set_padding(task, padding_top, padding_bottom, padding_left, - padding_right, 0); -} - -void nnx_task_set_dims_stride2x2( - nnx_task_t *task, const uint32_t h_in, const uint32_t w_in, - const uint32_t k_in, const uint32_t w_in_stride, const uint32_t k_in_stride, - const uint32_t h_out, const uint32_t w_out, const uint32_t k_out, - const uint32_t w_out_stride, const uint32_t k_out_stride, - const uint8_t h_ker, const uint8_t w_ker, const uint8_t padding_top, - const uint8_t padding_bottom, const uint8_t padding_right, - const uint8_t padding_left) { - const uint8_t stride = 2; - - ne16_task_set_strides(task, k_in, w_in_stride, k_in_stride, w_out_stride, - k_out_stride); - ne16_task_set_counters(task, k_in, h_out > 1 ? 3 : 1, w_out > 1 ? 3 : 1, - k_out, h_in + padding_top >= 5 ? 0 : padding_bottom, - 0/*w_out > 2 ? 0 : padding_right*/); - - const uint8_t padding_bottom_new = - (h_in + padding_top - h_ker) % stride == 0 ? 0 : padding_bottom; - const uint8_t padding_right_new = - (w_in + padding_left - w_ker) % stride == 0 ? 0 : padding_right; - - ne16_task_set_padding(task, padding_top, padding_bottom_new, padding_left, - padding_right_new, 0); -} - static inline uint32_t _get_tile_ptr(uint32_t ptr, int i, int j, int size_i, uint32_t size_j, uint32_t size_k, uint32_t stride_j, uint32_t stride_k, @@ -179,15 +87,8 @@ static inline uint32_t _get_tile_ptr(uint32_t ptr, int i, int j, int size_i, (j * (size_j - overlap_j) - offset_j) * stride_k * data_size / 8; } -/** nnx_dispatch_task_stride2x2 - * - * It uses NE16's 2x2 strided mode which reduces the number of writes NE16 does. 
- * This mode doesn't stride the NE16's subtile input pointer, so we have to - * tile the tile to the subtile's spatial dimensions (in this case 3x3 output). - * Works only if the k_out is divisible by 2. - */ -void nnx_dispatch_task_stride2x2( - nnx_task_t *task, const uint32_t w_in, const uint32_t k_in, +void ne16_nnx_dispatch_stride2x2( + ne16_dev_t *dev, ne16_task_t *task, const uint32_t w_in, const uint32_t k_in, const uint32_t w_in_stride, const uint32_t k_in_stride, const uint32_t h_out, const uint32_t w_out, const uint32_t k_out, const uint32_t w_out_stride, const uint32_t k_out_stride, @@ -221,8 +122,10 @@ void nnx_dispatch_task_stride2x2( task->data.cfg.padding = ne16_get_tile_padding(tile_padding, i, j, n_h, n_w); - nnx_dispatch_check_blocking(); - nnx_dispatch_task(task); + // Altered dispatch to wait if cannot acquire + while (ne16_nnx_dispatch(dev, task)) { + ne16_pulp_event_wait_and_clear(); + } } } } diff --git a/test/.gitignore b/test/.gitignore index 3fc5b6a..50e5358 100644 --- a/test/.gitignore +++ b/test/.gitignore @@ -2,7 +2,7 @@ BUILD __pycache__ .cache .pytest_cache -app/gen_inc +app/gen **/compile_commands.json **/*.log **/*.pt diff --git a/test/HeaderWriter.py b/test/HeaderWriter.py index eff0d01..5abb204 100644 --- a/test/HeaderWriter.py +++ b/test/HeaderWriter.py @@ -20,8 +20,11 @@ class HeaderWriter: - def __init__(self, incdir, tabwidth=4): - self.incdir = incdir + def __init__(self, gendir, tabwidth=4): + self.incdir = os.path.join(gendir, "inc") + os.makedirs(self.incdir, exist_ok=True) + self.srcdir = os.path.join(gendir, "src") + os.makedirs(self.srcdir, exist_ok=True) self.tabwidth = tabwidth def header_guard_begin(self, filename): @@ -60,7 +63,7 @@ def vector_size(self, data): def vector_declaration(self, name, size, _type): retval = "" retval += self.define(f"{name}_size", size) - retval += f"PI_L1 {_type} {name}[{name.upper()}_SIZE]" + retval += f"{_type} {name}[{name.upper()}_SIZE]" return retval def vector_initial_value(self, 
data, elements_per_row=10): @@ -92,8 +95,11 @@ def render_vector(self, name, size, _type, init=None, elements_per_row=10): retval += self.vector_end() return retval + def check_declaration(self, name): + return f"void check_{name}();\n\n" + def check(self, name): - return f"""static void check_{name}() {{ + return f"""void check_{name}() {{ printf("Checking the {name} vector:\\n"); int n_err = 0; @@ -126,15 +132,41 @@ def generate_header(self, name, body): file.write(filerender) def generate_vector_header(self, name, size, _type, init=None, golden=None): - bodyrender = "" - bodyrender += self.includes - bodyrender += self.render_vector(name, _type, size, init=init) + render = "" + render += self.includes + render += self.render_vector(name, "extern " + _type, size) + + if golden is not None: + render += self.render_vector("golden_" + name, "extern " + _type, size) + render += self.check_declaration(name) + + self.generate_header(name, render) + + def generate_source(self, name, body): + filename = name + ".c" + filepath = os.path.join(self.srcdir, filename) + + print(f"Generating source file -> {filepath}") + + with open(filepath, "w") as file: + file.write(body) + + def generate_vector_source(self, name, size, _type, init=None, golden=None): + render = "" + render += f'#include "{name}.h"\n\n' + render += self.render_vector(name, "PI_L1 " + _type, size, init=init) if golden is not None: - bodyrender += self.render_vector("golden_" + name, _type, size, init=golden) - bodyrender += self.check(name) + render += self.render_vector( + "golden_" + name, "PI_L1 " + _type, size, init=golden + ) + render += self.check(name) + + self.generate_source(name, render) - self.generate_header(name, bodyrender) + def generate_vector_files(self, name, size, _type, init=None, golden=None): + self.generate_vector_source(name, size, _type, init, golden) + self.generate_vector_header(name, size, _type, init, golden) def render_dims(self, name, dims): retval = "" diff --git 
a/test/Ne16TestClasses.py b/test/Ne16TestClasses.py index 79b5867..d99e829 100644 --- a/test/Ne16TestClasses.py +++ b/test/Ne16TestClasses.py @@ -16,7 +16,8 @@ # # SPDX-License-Identifier: Apache-2.0 -from typing import List, Union, Sequence, Optional, Set +from __future__ import annotations +from typing import List, Union, Optional, Set, Tuple import torch import numpy as np import torch.nn.functional as F @@ -101,15 +102,15 @@ def check_valid_bias_type(cls, v: Optional[IntegerType]) -> Optional[IntegerType Ne16TestConf._check_type("bias_type", v, ["int32"]) return v - @model_validator(mode="after") - def check_valid_out_channel_with_stride_2x2(self) -> "Ne16TestConf": + @model_validator(mode="after") # type: ignore + def check_valid_out_channel_with_stride_2x2(self) -> Ne16TestConf: assert implies( self.stride == Stride(height=2, width=2), self.out_channel % 2 == 0 ), f"With stride 2x2 supported only even output channel sizes. Given output channel {self.out_channel}" return self - @model_validator(mode="after") - def check_valid_depthwise(self) -> "Ne16TestConf": + @model_validator(mode="after") # type: ignore + def check_valid_depthwise(self) -> Ne16TestConf: assert implies( self.depthwise, self.kernel_shape == KernelShape(height=3, width=3) ), f"Depthwise supported only on 3x3 kernel shape. Given kernel shape {self.kernel_shape}." @@ -119,8 +120,8 @@ def check_valid_depthwise(self) -> "Ne16TestConf": ) return self - @model_validator(mode="after") - def check_valid_padding_with_kernel_shape_1x1(self) -> "Ne16TestConf": + @model_validator(mode="after") # type: ignore + def check_valid_padding_with_kernel_shape_1x1(self) -> Ne16TestConf: assert implies( self.kernel_shape == KernelShape(height=1, width=1), self.padding == Padding(top=0, bottom=0, left=0, right=0), @@ -133,16 +134,16 @@ def check_valid_has_norm_quant(cls, v: bool) -> bool: assert v == True, f"Untested without has_norm_quant." 
return v - @model_validator(mode="after") - def check_valid_norm_quant_types_when_has_norm_qunat(self) -> "Ne16TestConf": + @model_validator(mode="after") # type: ignore + def check_valid_norm_quant_types_when_has_norm_qunat(self) -> Ne16TestConf: if self.has_norm_quant: assert self.scale_type is not None, "Scale type was not provided." if self.has_bias: assert self.bias_type is not None, "Bias type was not provided." return self - @model_validator(mode="after") - def check_valid_out_type_with_flags(self) -> "Ne16TestConf": + @model_validator(mode="after") # type: ignore + def check_valid_out_type_with_flags(self) -> Ne16TestConf: assert implies( not self.has_norm_quant, self.out_type == Ne16.ACCUMULATOR_TYPE ), ( @@ -271,7 +272,7 @@ def _global_shift( return global_shift @staticmethod - def _random_data(_type: IntegerType, shape: Sequence[int]): + def _random_data(_type: IntegerType, shape: Tuple[int, int, int, int]): return torch.randint(_type.min, _type.max, size=shape) @staticmethod @@ -393,12 +394,11 @@ def regenerate(test: Ne16Test, regen_tensors: Set[str]) -> Ne16Test: class Ne16TestHeaderGenerator: - DEFAULT_HEADERS_DIR = "app/gen_inc" + DEFAULT_HEADERS_DIR = "app/gen" def __init__(self, headers_dir: Optional[Union[str, os.PathLike]] = None): if headers_dir is None: headers_dir = Ne16TestHeaderGenerator.DEFAULT_HEADERS_DIR - os.makedirs(headers_dir, exist_ok=True) self.header_writer = HeaderWriter(headers_dir) def generate(self, test_name: str, test: Ne16Test): @@ -409,14 +409,14 @@ def generate(self, test_name: str, test: Ne16Test): # Render input in_ctype = test.conf.in_type.ctype() in_data = test.input.permute(0, 2, 3, 1).ravel() - self.header_writer.generate_vector_header( + self.header_writer.generate_vector_files( "input", _type=in_ctype, size=in_data.numel(), init=in_data ) # Render output out_ctype = test.conf.out_type.ctype() out_data_golden = test.output.permute(0, 2, 3, 1).ravel() - self.header_writer.generate_vector_header( + 
self.header_writer.generate_vector_files( "output", _type=out_ctype, size=out_data_golden.numel(), @@ -436,7 +436,7 @@ def generate(self, test_name: str, test: Ne16Test): weight_type._bits, depthwise=test.conf.depthwise, ) - self.header_writer.generate_vector_header( + self.header_writer.generate_vector_files( "weight", _type="uint8_t", size=weight_init.size, init=weight_init ) @@ -444,7 +444,7 @@ def generate(self, test_name: str, test: Ne16Test): if test.scale is not None: assert test.conf.scale_type is not None scale_ctype = test.conf.scale_type.ctype() - self.header_writer.generate_vector_header( + self.header_writer.generate_vector_files( "scale", _type=scale_ctype, size=test.scale.numel(), @@ -455,7 +455,7 @@ def generate(self, test_name: str, test: Ne16Test): if test.bias is not None: assert test.conf.bias_type is not None bias_ctype = test.conf.bias_type.ctype() - self.header_writer.generate_vector_header( + self.header_writer.generate_vector_files( "bias", _type=bias_ctype, size=test.bias.numel(), init=test.bias.ravel() ) diff --git a/test/TestClasses.py b/test/TestClasses.py index c56eec2..c10641c 100644 --- a/test/TestClasses.py +++ b/test/TestClasses.py @@ -52,7 +52,7 @@ class Padding(BaseModel): class IntegerType(BaseModel): name: str - @model_validator(mode="before") + @model_validator(mode="before") # type: ignore @classmethod def model_validate_before(cls, data: Any) -> Dict: if isinstance(data, str): @@ -122,5 +122,5 @@ def model_dump( exclude_none: bool = False, round_trip: bool = False, warnings: bool = True, - ) -> str: + ) -> dict[str, Any]: ... diff --git a/test/app/Makefile b/test/app/Makefile index e5051b1..14f30fd 100644 --- a/test/app/Makefile +++ b/test/app/Makefile @@ -16,43 +16,49 @@ # # SPDX-License-Identifier: Apache-2.0 -APP := main +# Set runner_args="--trace=ne16" if you want to trace what is happening in the ne16 + +ACCELERATOR ?= ne16 +APP := main LIBDIR := $(abspath ../..) 
+ACC_DIR := $(LIBDIR)/$(ACCELERATOR) # Include directories ## Test -INC_DIRS += inc gen_inc +INC_DIRS += inc -## PULP-NNX -INC_DIRS += $(LIBDIR)/inc +## Library +INC_DIRS += $(LIBDIR)/inc $(LIBDIR)/util -## NE16 -INC_DIRS += $(LIBDIR)/ne16/hal $(LIBDIR)/ne16/gvsoc +## Accelerator +INC_DIRS += $(ACC_DIR)/hal $(ACC_DIR)/gvsoc $(ACC_DIR)/bsp -## Util -INC_DIRS += $(LIBDIR)/util +## Generated +INC_DIRS += gen/inc INC_FLAGS += $(addprefix -I,$(INC_DIRS)) - # Source files ## Test APP_SRCS += $(wildcard src/*.c) -## PULP-NNX -APP_SRCS += $(LIBDIR)/src/pulp_nnx_ne16.c +## Library +APP_SRCS += $(LIBDIR)/src/pulp_nnx_$(ACCELERATOR).c $(wildcard $(LIBDIR)/util/*.c) + +## Accelerator +APP_SRCS += $(wildcard $(ACC_DIR)/hal/*.c) $(wildcard $(ACC_DIR)/gvsoc/*.c) $(wildcard $(ACC_DIR)/bsp/*.c) -## NE16 -APP_SRCS += $(wildcard $(LIBDIR)/ne16/hal/*.c) $(wildcard $(LIBDIR)/ne16/gvsoc/*.c) +## Generated +APP_SRCS += $(wildcard gen/src/*.c) -## Util -APP_SRCS += $(LIBDIR)/util/pulp_nnx_util.c +# Flags -APP_CFLAGS += $(INC_FLAGS) -O2 -w +APP_CFLAGS += $(INC_FLAGS) -O2 -w -Wall -Werror -flto +APP_LDFLAGS += -flto include $(RULES_DIR)/pmsis_rules.mk diff --git a/test/app/src/main.c b/test/app/src/main.c index 1f191b3..cc67050 100644 --- a/test/app/src/main.c +++ b/test/app/src/main.c @@ -22,8 +22,9 @@ #include "layer_util.h" #include "nnx_layer.h" +#include "output.h" -void app_kickoff(void *args) { +int main() { struct pi_device cl_dev; struct pi_cluster_conf cl_conf; struct pi_cluster_task cl_task; @@ -47,7 +48,8 @@ void app_kickoff(void *args) { printf("\n"); printf("Test %s finished\n", TEST_NAME); - pmsis_exit(0); -} + printf("\n"); + check_output(); -int main() { return pmsis_kickoff((void *)app_kickoff); } + return 0; +} diff --git a/test/app/src/nnx_layer.c b/test/app/src/nnx_layer.c index fe2924f..ffd93a1 100644 --- a/test/app/src/nnx_layer.c +++ b/test/app/src/nnx_layer.c @@ -18,13 +18,13 @@ * SPDX-License-Identifier: Apache-2.0 */ -#include - #include "nnx_layer.h" -#include 
"pulp_nnx.h" - -#include "ne16_gvsoc_logging.h" -#include "ne16_hal.h" +#include "ne16.h" +#include "ne16_gvsoc.h" +#include "ne16_pulp_bsp.h" +#include "ne16_task.h" +#include "pulp_nnx_ne16.h" +#include // Generated headers #include "bias.h" @@ -34,65 +34,73 @@ #include "scale.h" #include "weight.h" -void execute_nnx_layer(void *unused_args) { - ne16_activate_gvsoc_logging(NE16_GVSOC_LOG_LEVEL_ALL, - NE16_GVSOC_LOGGING_FORMAT_HEXADECIMAL); - const int nnx_max_stall = 8; - nnx_init(nnx_max_stall); +static void task_prepare(ne16_task_t *task) { + ne16_task_init(task, WEIGHT_HEIGHT, GROUPS > 1, INPUT_BITS, OUTPUT_BITS, + WEIGHT_BITS, weightOffsetModeLayerWise, WEIGHT_OFFSET, + (ne16_quant_t){.shift_amount = OUTSHIFT, + .mode = quantMode8Bit, + .function = HAS_RELU ? quantFunctionRelu + : quantFunctionIdentity, + .flag_rounding = ne16TaskFlagFalse}, + (ne16_norm_t){.mode = normMode8Bit, + .flag_bias = HAS_BIAS ? ne16TaskFlagTrue + : ne16TaskFlagFalse, + .flag_shift = ne16TaskFlagFalse}, + STRIDE_HEIGHT); - nnx_task_t task; - nnx_task_init( - &task, WEIGHT_HEIGHT, GROUPS > 1, INPUT_BITS, OUTPUT_BITS, WEIGHT_BITS, - weightOffsetModeLayerWise, WEIGHT_OFFSET, - (nnx_quant_t){.shift_amount = OUTSHIFT, - .mode = quantMode8Bit, - .function = - HAS_RELU ? quantFunctionRelu : quantFunctionIdentity, - .flag_rounding = NE16_FLAG_UNUSED}, - (nnx_norm_t){.mode = normMode8Bit, - .flag_bias = HAS_BIAS ? 
NE16_FLAG_USED : NE16_FLAG_UNUSED, - .flag_shift = NE16_FLAG_UNUSED}, - STRIDE_HEIGHT); - - if (STRIDE_HEIGHT == 1) { - nnx_task_set_dims(&task, INPUT_WIDTH, INPUT_CHANNEL, INPUT_WIDTH, - INPUT_CHANNEL, OUTPUT_HEIGHT, OUTPUT_WIDTH, - OUTPUT_CHANNEL, OUTPUT_WIDTH, OUTPUT_CHANNEL, PADDING_TOP, - PADDING_BOTTOM, PADDING_RIGHT, PADDING_LEFT); - } else { - nnx_task_set_dims_stride2x2( - &task, INPUT_HEIGHT, INPUT_WIDTH, INPUT_CHANNEL, INPUT_WIDTH, + if (STRIDE_WIDTH == 2 && STRIDE_HEIGHT == 2) { + ne16_task_set_dims_stride2x2( + task, INPUT_HEIGHT, INPUT_WIDTH, INPUT_CHANNEL, INPUT_WIDTH, INPUT_CHANNEL, OUTPUT_HEIGHT, OUTPUT_WIDTH, OUTPUT_CHANNEL, OUTPUT_WIDTH, OUTPUT_CHANNEL, WEIGHT_HEIGHT, WEIGHT_WIDTH, PADDING_TOP, PADDING_BOTTOM, PADDING_RIGHT, PADDING_LEFT); + } else { + ne16_task_set_dims(task, INPUT_WIDTH, INPUT_CHANNEL, INPUT_WIDTH, + INPUT_CHANNEL, OUTPUT_HEIGHT, OUTPUT_WIDTH, + OUTPUT_CHANNEL, OUTPUT_WIDTH, OUTPUT_CHANNEL, PADDING_TOP, + PADDING_BOTTOM, PADDING_RIGHT, PADDING_LEFT); } - nnx_task_set_ptrs(&task, input, INPUT_WIDTH, INPUT_CHANNEL, INPUT_BITS, - PADDING_TOP, PADDING_LEFT, output, weight, scale, NULL, + ne16_task_set_ptrs(task, (uint32_t)input, INPUT_WIDTH, INPUT_CHANNEL, + INPUT_BITS, PADDING_TOP, PADDING_LEFT, (uint32_t)output, + (uint32_t)weight, (uint32_t)scale, NULL, #if HAS_BIAS == 1 - bias + (uint32_t)bias #else - NULL + NULL #endif ); +} - nnx_dispatch_check_blocking(); +static void task_execute(ne16_task_t *task) { + ne16_dev_t *dev = ne16_pulp_get_dev(); - if (STRIDE_HEIGHT == 1) { - nnx_dispatch_task(&task); - } else { - nnx_dispatch_task_stride2x2(&task, INPUT_WIDTH, INPUT_CHANNEL, INPUT_WIDTH, + ne16_gvsoc_log_activate(dev, NE16_GVSOC_LOG_LEVEL_CONFIG, + NE16_GVSOC_LOG_FORMAT_HEXADECIMAL); + + ne16_pulp_conf_t conf = {.max_stall = 8}; + ne16_nnx_init(dev, &conf); + + ne16_nnx_dispatch_wait(dev); + + if (STRIDE_WIDTH == 2 && STRIDE_HEIGHT == 2) { + ne16_nnx_dispatch_stride2x2(dev, task, INPUT_WIDTH, INPUT_CHANNEL, INPUT_WIDTH, 
INPUT_CHANNEL, OUTPUT_HEIGHT, OUTPUT_WIDTH, OUTPUT_CHANNEL, OUTPUT_WIDTH, OUTPUT_CHANNEL, WEIGHT_HEIGHT, WEIGHT_WIDTH); + } else { + ne16_nnx_dispatch(dev, task); } - // nnx_resolve_check_blocking(&task); - while (!ne16_empty()) - ne16_event_wait(); - nnx_term(); - ne16_deactivate_gvsoc_logging(); + ne16_nnx_resolve_wait(dev, task); + + ne16_nnx_term(dev); + + ne16_gvsoc_log_deactivate(dev); +} - printf("\n"); - check_output(); +void execute_nnx_layer(void *args) { + ne16_task_t task; + task_prepare(&task); + task_execute(&task); } diff --git a/test/conftest.py b/test/conftest.py index 7ed485c..6c2c15b 100644 --- a/test/conftest.py +++ b/test/conftest.py @@ -49,7 +49,7 @@ def pytest_addoption(parser): "--timeout", type=int, default=120, - help="Execution timeout in seconds. Default: 120s" + help="Execution timeout in seconds. Default: 120s", ) diff --git a/test/test.py b/test/test.py index 461db19..39709b6 100644 --- a/test/test.py +++ b/test/test.py @@ -22,6 +22,7 @@ import locale import subprocess from Ne16TestClasses import Ne16Test, Ne16TestHeaderGenerator +from pathlib import Path HORIZONTAL_LINE = "\n" + "-" * 100 + "\n" @@ -99,6 +100,7 @@ def test(path: str, timeout: int): Ne16TestHeaderGenerator().generate(test_name, test) + Path("app/src/nnx_layer.c").touch() cmd = f"make -C app all run platform=gvsoc" passed, msg, stdout, stderr = execute_command(cmd=cmd, timeout=timeout) diff --git a/util/hwpe.c b/util/hwpe.c new file mode 100644 index 0000000..53c1ace --- /dev/null +++ b/util/hwpe.c @@ -0,0 +1,85 @@ +/* + * Luka Macan + * + * Copyright 2023 ETH Zurich and University of Bologna + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ *
+ * SPDX-License-Identifier: Apache-2.0
+ */
+
+#include "hwpe.h"
+#include <stdint.h>
+
+#define HWPE_TRIGGER 0
+#define HWPE_ACQUIRE 1
+#define HWPE_FINISHED 2
+#define HWPE_STATUS 3
+#define HWPE_RUNNING_JOB 4
+#define HWPE_SOFT_CLEAR 5
+#define HWPE_SWSYNC 6
+#define HWPE_TASK_REG_OFFSET 8 // word index of task register 0
+
+inline void hwpe_reg_write(hwpe_dev_t *dev, int reg, uint32_t value) {
+  *(dev->base_addr + reg) = value; // reg indexes 32-bit words, not bytes
+}
+
+inline uint32_t hwpe_reg_read(hwpe_dev_t *dev, int reg) {
+  return *(dev->base_addr + reg);
+}
+
+inline void hwpe_task_reg_write(hwpe_dev_t *dev, int reg, uint32_t value) {
+  hwpe_reg_write(dev, HWPE_TASK_REG_OFFSET + reg, value);
+}
+
+inline uint32_t hwpe_task_reg_read(hwpe_dev_t *dev, int reg) {
+  return hwpe_reg_read(dev, HWPE_TASK_REG_OFFSET + reg);
+}
+
+void hwpe_soft_clear(hwpe_dev_t *dev) {
+  hwpe_reg_write(dev, HWPE_SOFT_CLEAR, 0);
+  for (volatile int i = 0; i < 10; i++) // empty delay loop kept by volatile
+    ;
+}
+
+uint32_t hwpe_task_queue_status(hwpe_dev_t *dev) {
+  return hwpe_reg_read(dev, HWPE_STATUS);
+}
+
+int hwpe_task_queue_acquire_task(hwpe_dev_t *dev, uint8_t *id) {
+  uint32_t read_value = hwpe_reg_read(dev, HWPE_ACQUIRE);
+  if (read_value >= 256) { // not representable as a uint8_t id
+    return 1;
+  } else {
+    *id = (uint8_t)read_value;
+    return 0;
+  }
+}
+
+void hwpe_task_queue_write_task(hwpe_dev_t *dev, uint32_t *data, int len) {
+  for (int i = 0; i < len; i++) {
+    hwpe_task_reg_write(dev, i, data[i]);
+  }
+}
+
+void hwpe_task_queue_release_and_run(hwpe_dev_t *dev) {
+  hwpe_reg_write(dev, HWPE_TRIGGER, 0);
+}
+
+void hwpe_task_queue_release(hwpe_dev_t *dev) {
+  hwpe_reg_write(dev, HWPE_TRIGGER, 1);
+}
+
+uint8_t hwpe_last_task_id(hwpe_dev_t *dev) {
+  return (uint8_t)hwpe_reg_read(dev, HWPE_RUNNING_JOB);
+}
diff --git a/util/hwpe.h b/util/hwpe.h
new file mode 100644
index 0000000..52bf912
--- /dev/null
+++ b/util/hwpe.h
@@ -0,0 +1,67 @@
+/*
+ * Luka Macan
+ *
+ * Copyright 2023 ETH Zurich and University of Bologna
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ *
+ * SPDX-License-Identifier: Apache-2.0
+ */
+
+#ifndef __HWPE_H__
+#define __HWPE_H__
+
+#include <stdint.h>
+
+/* HWPE device: registers are 32-bit words addressed as base_addr + index. */
+typedef struct hwpe_dev_t {
+  volatile uint32_t *base_addr;
+} hwpe_dev_t;
+
+/* Store `value` in the register at word index `reg`. */
+void hwpe_reg_write(hwpe_dev_t *dev, int reg, uint32_t value);
+
+/* Return the register at word index `reg`. */
+uint32_t hwpe_reg_read(hwpe_dev_t *dev, int reg);
+
+/* Like hwpe_reg_write(), with `reg` counted from the first task register. */
+void hwpe_task_reg_write(hwpe_dev_t *dev, int reg, uint32_t value);
+
+/* Like hwpe_reg_read(), with `reg` counted from the first task register. */
+uint32_t hwpe_task_reg_read(hwpe_dev_t *dev, int reg);
+
+/* Write 0 to the soft-clear register, then run a short empty delay loop. */
+void hwpe_soft_clear(hwpe_dev_t *dev);
+
+/* Return the raw value of the status register. */
+uint32_t hwpe_task_queue_status(hwpe_dev_t *dev);
+
+/*
+ * Read the acquire register. A value below 256 is stored in `*id` and 0 is
+ * returned; otherwise 1 is returned and `*id` is left untouched.
+ */
+int hwpe_task_queue_acquire_task(hwpe_dev_t *dev, uint8_t *id);
+
+/* Write data[0..len-1] to task registers 0..len-1. */
+void hwpe_task_queue_write_task(hwpe_dev_t *dev, uint32_t *data, int len);
+
+/* Write 0 to the trigger register. */
+void hwpe_task_queue_release_and_run(hwpe_dev_t *dev);
+
+/* Write 1 to the trigger register. */
+void hwpe_task_queue_release(hwpe_dev_t *dev);
+
+/* Return the running-job register truncated to 8 bits. */
+uint8_t hwpe_last_task_id(hwpe_dev_t *dev);
+
+#endif // !__HWPE_H__