Add neureka support #2

Merged: 72 commits, Feb 1, 2024

Commits
8cd7965
Add neureka support similar to ne16
lukamac Jan 14, 2024
e096415
add neureka support to test app
lukamac Jan 14, 2024
c480025
Add weight mem source flag
lukamac Jan 14, 2024
953db1e
Fix formatting
lukamac Jan 15, 2024
a114acd
Fix strides and counters, remove stride2x2 and flag16 mode
lukamac Jan 16, 2024
94bbe08
Fix uninitialized L1 data
lukamac Jan 16, 2024
6341d0e
Fixup nnx_quant_t nnx_norm_t
lukamac Jan 16, 2024
eddf389
Add Neureka weight roll/unroll script
lukamac Jan 17, 2024
e281f4d
Fix NE16 weight rolling, which was unpacking to bits instead of 8 bits
lukamac Jan 17, 2024
d6008b6
Fix generated arrays initialization
lukamac Jan 18, 2024
56afb5f
Fix Neureka weight unroll for 1x1 mode
lukamac Jan 18, 2024
925aa41
Fix Arpan's name in the contributors
lukamac Jan 18, 2024
0140665
Remove WIEGHT_D0_STRIDE_MODE_1x1
lukamac Jan 18, 2024
741b5d7
Add multi-accelerator support and neureka as a target
lukamac Jan 18, 2024
27664cd
Remove stride2x2 for neureka
lukamac Jan 18, 2024
ccc835b
Fix formatting
lukamac Jan 18, 2024
df0eb6b
Skip invalid tests
lukamac Jan 18, 2024
31dc36e
Change invalid test skip to explicit pytest skip with reason
lukamac Jan 18, 2024
7bae33c
Add neureka test to the CI
lukamac Jan 18, 2024
a0080c6
Fix formatting
lukamac Jan 18, 2024
be50522
Add --print-tensors flag
lukamac Jan 19, 2024
2c07dbd
Remove memcpys since it was a linker script bug
lukamac Jan 19, 2024
6a82ff5
Add Application section to test's readme
lukamac Jan 19, 2024
295ce90
Fix formatting
lukamac Jan 19, 2024
dc2409c
Fix accelerator name printing
lukamac Jan 19, 2024
d6ba620
Add input_signed to neureka
lukamac Jan 19, 2024
0934172
Add readme per accelerator
lukamac Jan 19, 2024
2947229
Remove ne16 input dim 2 stride calculation
lukamac Jan 19, 2024
b7442ee
Move common validators to NnxTest
lukamac Jan 19, 2024
91effde
Fix xored relu with unsigned instead of signed
lukamac Jan 22, 2024
0187793
Change gvsoc functions to have a body only when running in gvsoc
lukamac Jan 22, 2024
4fab480
Remove %s from main strings
lukamac Jan 22, 2024
3906e7a
Fix output checking should be done before cluster close since data is…
lukamac Jan 22, 2024
774cfd3
Remove L2 copies of tensors
lukamac Jan 22, 2024
4358ed5
Replace pointer arithmetic with array indexing
lukamac Jan 22, 2024
668c045
Fix formatting
lukamac Jan 22, 2024
cf70c78
Fix Neureka name in readme
lukamac Jan 22, 2024
b02e018
Align names with Neureka.py
lukamac Jan 24, 2024
5e61e39
Pad cin
lukamac Jan 24, 2024
8345b1d
Rename <acc>.py to <acc>MemoryLayout.py
lukamac Jan 25, 2024
a623ccf
Extract functional model from test gen
lukamac Jan 25, 2024
b743930
Remove conf from NeuralEngineFunctionalModel
lukamac Jan 25, 2024
f425ea5
Add isort
lukamac Jan 26, 2024
932847d
Remove neureka siracusa clock gating
lukamac Jan 26, 2024
7ff3fcb
Remove inline from hal
lukamac Jan 26, 2024
772fd95
Remove xor, Python has xor
lukamac Jan 26, 2024
2223af9
WIP: Add without norm_quant
lukamac Jan 26, 2024
8232527
Remove -flto
lukamac Jan 26, 2024
df3f5dd
Add -std=c11
lukamac Jan 26, 2024
24ebd9e
Move stride shift to stride2x2 function
lukamac Jan 26, 2024
f1ed5f6
Fix flag clear before setting
lukamac Jan 26, 2024
c47b2c5
Fix normMode hardcoded to 32bit
lukamac Jan 26, 2024
27ab3a5
Set quantMode in *_set_bits function
lukamac Jan 26, 2024
23009cc
Fix output d0 stride and rename defs
lukamac Jan 27, 2024
e78dd80
Fixes to strides and stride2x2
lukamac Jan 27, 2024
a6f142a
Add no norm_quant to neureka and all the fixes too
lukamac Jan 27, 2024
c436ea4
Fix stride2x2 validity check for out channel to check stride evenness
lukamac Jan 27, 2024
eda8b51
Fix formatting
lukamac Jan 27, 2024
8b37485
Remove TODOs because neureka clearly needs these functions
lukamac Jan 27, 2024
d9c45ef
Remove equal 0 check because that can never be the case due to the re…
lukamac Jan 27, 2024
29ee483
Remove TODOs, checked padding
lukamac Jan 27, 2024
d9c7723
Rename divnceil and remainder, and add nnx_ prefix
lukamac Jan 27, 2024
37ba86c
Add citation
lukamac Jan 29, 2024
9e0b211
Add sdk and compiler commit hashes
lukamac Jan 29, 2024
1a4f873
Change task size to a define
lukamac Jan 29, 2024
0412759
Update supported accelerator features
lukamac Jan 29, 2024
b9f3de4
Update changelog
lukamac Jan 29, 2024
5990d83
Update pulp-sdk commit hash
lukamac Jan 29, 2024
07f47d9
Remove -std=c11 flag
lukamac Jan 29, 2024
0791162
Fix readme collapsible verbatim
lukamac Jan 29, 2024
9145445
Remove __PLATFORM__ check from the library since it's pulp-sdk specific
lukamac Jan 29, 2024
6d24dd8
Change channel and bits with w_in_stride for set_ptrs
lukamac Jan 30, 2024
24 changes: 20 additions & 4 deletions .gitlab-ci.yml
@@ -20,25 +20,41 @@ stages:
- lint
- test

format_python:
python_format:
stage: lint
tags:
- python-lint
script:
- black --check .

static_check_python:
python_sort_imports:
stage: lint
tags:
- python-lint
script:
- isort --check test

python_static_check:
stage: lint
tags:
- python-lint
script:
- pyright .

run_test0:
run_ne16_test:
stage: test
tags:
- gap9-sdk
artifacts:
untracked: true
script:
- cd test && pytest test.py --test-dir tests --recursive
- cd test && pytest test.py --test-dir tests --recursive -A ne16

run_neureka_test:
stage: test
tags:
- siracusa-sdk
artifacts:
untracked: true
script:
- cd test && pytest test.py --test-dir tests --recursive -A neureka
20 changes: 20 additions & 0 deletions CHANGELOG.md
@@ -1,5 +1,25 @@
# Changelog

## [Unreleased]

### Added

- N-EUREKA accelerator support: 3x3, 1x1, and 3x3 depthwise convolution kernels
- Support for kernels without normalization and quantization for NE16
- isort check
- publication citation

### Changed

- `ne16_task_init` got split into smaller parts: `ne16_task_init`, `ne16_task_set_op_to_conv`, `ne16_task_set_weight_offset`, `ne16_task_set_bits`, `ne16_task_set_norm_quant`
- strides in `ne16_task_set_strides`, `ne16_task_set_dims`, and `ne16_task_set_ptrs` are now strides between consecutive elements in that dimension
- `ne16_task_queue_size` is now `NE16_TASK_QUEUE_SIZE`

### Removed

- `k_in_stride`, `w_in_stride`, `k_out_stride`, and `w_out_stride` from `ne16_nnx_dispatch_stride2x2`
- `mode` attribute from `ne16_quant_t` structure

## [0.3.0] - 2024-01-14

### Added
80 changes: 37 additions & 43 deletions README.md
@@ -39,51 +39,22 @@ _Note: The accelerator can provide additional helper functions if needed._

## Accelerators

### NE16

Github repo [link](https://github.com/pulp-platform/ne16).

#### Implemented features

- [x] Convolution w/ kernel shape 1x1
- [x] Convolution w/ kernel shape 3x3
- [x] Depthwise convolution w/ kernel shape 3x3
- [x] Stride 1x1
- [x] Stride 2x2
- [ ] Normalization and quantization
- [x] With
- [ ] Without
- [x] Relu (w/ and w/o)
- [x] Bias (w/ and w/o)
- [ ] Per-channel shift
- [x] Per-layer shift
- [ ] Rounding
- [ ] Input type
- [x] uint8
- [ ] uint16
- [ ] Output type
- [x] int8
- [x] uint8 (only w/ Relu)
- [ ] int32
- [ ] uint32 (only w/ Relu)
- [ ] Scale type
- [x] uint8
- [ ] uint16
- [ ] uint32
- [x] Bias type
- [x] int32
- [ ] Weight type
- [x] int8
- [ ] int2-7

### Neureka

**Untested and considered broken.**
- [NE16](ne16/README.md)
- [Neureka](neureka/README.md)

## Testing

You can find information about testing in the dedicated [README](test/README.md).

### Environment

The library was tested with the following pairs of SDKs and compilers:

| SDK | SDK Commit Hash | Compiler | Compiler Commit Hash |
| --- | --------------- | -------- | -------------------- |
| gap\_sdk (obtainable from GreenWaves Technologies) | 90df4ce219 | [gap\_gnu\_toolchain](https://github.com/GreenWaves-Technologies/gap_gnu_toolchain) | 360fd4f9d6 |
| [pulp-sdk](https://github.com/Scheremo/pulp-sdk) | c216298881 | [pulp-riscv-gnu-toolchain](https://github.com/GreenWaves-Technologies/gap_gnu_toolchain) | 9938bd8fcf (release v1.0.16) |

## Contributing

Bug reports and feature requests should be reported through issues.
@@ -93,15 +64,38 @@ All the development should be done through forks and merged onto the `dev` branch

The library will follow [Semantic Versioning](https://semver.org/).

## Citing
## Publication

<details>
<summary>If you use PULP-NNX in your work, you can cite us:</summary>

```
@inproceedings{10.1145/3607889.3609092,
author = {Macan, Luka and Burrello, Alessio and Benini, Luca and Conti, Francesco},
title = {WIP: Automatic DNN Deployment on Heterogeneous Platforms: the GAP9 Case Study},
year = {2024},
isbn = {9798400702907},
publisher = {Association for Computing Machinery},
address = {New York, NY, USA},
url = {https://doi.org/10.1145/3607889.3609092},
doi = {10.1145/3607889.3609092},
abstract = {Emerging Artificial-Intelligence-enabled System-on-Chips (AI-SoCs) combine a flexible microcontroller with parallel Digital Signal Processors (DSP) and heterogeneous acceleration capabilities. In this Work-in-Progress paper, we focus on the GAP9 RISC-V SoC as a case study to show how the open-source DORY Deep Neural Network (DNN) tool flow can be extended for heterogeneous acceleration by fine grained interleaving of a dedicated Neural Engine and a cluster of RISC-V cores. Our results show that up to 91\% of the peak accelerator throughput can be extracted in end-to-end execution of benchmarks based on MobileNet-V1 and V2.},
booktitle = {Proceedings of the International Conference on Compilers, Architecture, and Synthesis for Embedded Systems},
pages = {9–10},
numpages = {2},
keywords = {TinyML, MCUs, deep learning, HW accelerators},
location = {Hamburg, Germany},
series = {CASES '23 Companion}
}
```

*TBA*
</details>

## Contributors

* Luka Macan <[[email protected]](mailto:[email protected])>
* Francesco Conti <[[email protected]](mailto:[email protected])>
* Arpan Prasad <[[email protected]](mailto:[email protected])>
* Arpan Suravi Prasad <[[email protected]](mailto:[email protected])>

## License

15 changes: 7 additions & 8 deletions inc/pulp_nnx_ne16.h
@@ -43,7 +43,8 @@ void ne16_nnx_dispatch_wait(ne16_dev_t *dev);
/** ne16_nnx_dispatch
*
* Dispatch a task to the accelerator.
* Fails with return code 1 if the task cannot be dispatched. Otherwise returns 0.
* Fails with return code 1 if the task cannot be dispatched. Otherwise returns
* 0.
*/
int ne16_nnx_dispatch(ne16_dev_t *dev, ne16_task_t *task);

@@ -59,7 +60,6 @@ int ne16_nnx_resolve_check(ne16_dev_t *dev, ne16_task_t *task);
*/
void ne16_nnx_resolve_wait(ne16_dev_t *dev, ne16_task_t *task);


/* Additional helper functions */

/** ne16_nnx_dispatch_stride2x2
@@ -69,9 +69,8 @@ void ne16_nnx_resolve_wait(ne16_dev_t *dev, ne16_task_t *task);
* tile the tile to the subtile's spatial dimensions (in this case 3x3 output).
* Works only if the k_out is divisible by 2.
*/
void ne16_nnx_dispatch_stride2x2(
ne16_dev_t *dev, ne16_task_t *task, const uint32_t w_in, const uint32_t k_in,
const uint32_t w_in_stride, const uint32_t k_in_stride,
const uint32_t h_out, const uint32_t w_out, const uint32_t k_out,
const uint32_t w_out_stride, const uint32_t k_out_stride,
const uint8_t h_ker, const uint8_t w_ker);
void ne16_nnx_dispatch_stride2x2(ne16_dev_t *dev, ne16_task_t *task,
const uint32_t w_in, const uint32_t k_in,
const uint32_t h_out, const uint32_t w_out,
const uint32_t k_out, const uint8_t h_ker,
const uint8_t w_ker);
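
For orientation, here is a minimal usage sketch of the dispatch/resolve API declared in this header. It only strings together the functions shown above; device bring-up and the `ne16_task_t` configuration are assumed to happen elsewhere and are not part of this diff.

```c
#include "pulp_nnx_ne16.h"

// Minimal sketch: `dev` and `task` are assumed to be initialized and fully
// configured elsewhere (device bring-up and task setup are not shown here).
static void run_one_ne16_task(ne16_dev_t *dev, ne16_task_t *task) {
  // ne16_nnx_dispatch returns 1 while the task queue is full, 0 on success.
  while (ne16_nnx_dispatch(dev, task)) {
    ne16_nnx_dispatch_wait(dev); // block until a queue slot frees up
  }
  // Block until the accelerator has finished this task.
  ne16_nnx_resolve_wait(dev, task);
}
```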
61 changes: 61 additions & 0 deletions inc/pulp_nnx_neureka.h
@@ -0,0 +1,61 @@
/*
* Luka Macan <[email protected]>
*
* Copyright 2023 ETH Zurich and University of Bologna
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*
* SPDX-License-Identifier: Apache-2.0
*/

#include "neureka.h"
#include "neureka_siracusa_bsp.h"
#include "neureka_task.h"
#include <stdint.h>

/* PULP-NNX interface */

void neureka_nnx_init(neureka_dev_t *dev, neureka_siracusa_conf_t *conf);
void neureka_nnx_term(neureka_dev_t *dev);

/** neureka_nnx_dispatch_check
*
* Check whether you can dispatch to the accelerator.
*/
int neureka_nnx_dispatch_check(neureka_dev_t *dev);

/** neureka_nnx_dispatch_wait
*
* Block until you can dispatch to the accelerator.
*/
void neureka_nnx_dispatch_wait(neureka_dev_t *dev);

/** neureka_nnx_dispatch
*
* Dispatch a task to the accelerator.
* Fails with return code 1 if the task cannot be dispatched. Otherwise returns
* 0.
*/
int neureka_nnx_dispatch(neureka_dev_t *dev, neureka_task_t *task);

/** neureka_nnx_resolve_check
*
* Check whether the task has been resolved.
*/
int neureka_nnx_resolve_check(neureka_dev_t *dev, neureka_task_t *task);

/** neureka_nnx_resolve_wait
*
* Block until you can resolve the task.
*/
void neureka_nnx_resolve_wait(neureka_dev_t *dev, neureka_task_t *task);
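
As with the NE16 header, a hedged end-to-end sketch of the N-EUREKA lifecycle built only from the declarations above; the contents of `neureka_task_t` and `neureka_siracusa_conf_t` are assumed to be configured elsewhere.

```c
#include "pulp_nnx_neureka.h"

// Minimal sketch: `conf` and `task` are assumed to be set up elsewhere; this
// only chains the calls declared in this header.
static void run_one_neureka_task(neureka_siracusa_conf_t *conf,
                                 neureka_dev_t *dev, neureka_task_t *task) {
  neureka_nnx_init(dev, conf);

  // neureka_nnx_dispatch returns 1 while the task queue is full, 0 on success.
  while (neureka_nnx_dispatch(dev, task)) {
    neureka_nnx_dispatch_wait(dev); // block until a queue slot frees up
  }

  neureka_nnx_resolve_wait(dev, task); // wait for completion
  neureka_nnx_term(dev);
}
```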
36 changes: 36 additions & 0 deletions ne16/README.md
@@ -0,0 +1,36 @@
# NE16

## Docs

- Github repo [link](https://github.com/pulp-platform/ne16).

## Implemented features

- [x] Convolution w/ kernel shape 1x1
- [x] Convolution w/ kernel shape 3x3
- [x] Depthwise convolution w/ kernel shape 3x3
- [x] Stride 2x2
- [ ] Normalization and quantization
- [x] With
- [x] Without
- [x] Relu (w/ and w/o)
- [x] Bias (w/ and w/o)
- [ ] Per-channel shift
- [x] Per-layer shift
- [ ] Rounding
- [ ] Input type
- [x] uint8
- [ ] uint16
- [ ] Output type
- [x] int8
- [x] uint8 (only w/ Relu)
- [x] int32
- [ ] Scale type
- [x] uint8
- [ ] uint16
- [ ] uint32
- [x] Bias type
- [x] int32
- [ ] Weight type
- [x] int8
- [ ] int2-7
2 changes: 0 additions & 2 deletions ne16/hal/ne16.c
@@ -23,8 +23,6 @@
#define NE16_STATUS_EMPTY (0x000)
#define NE16_STATUS_FULL (0x101)

inline int ne16_task_queue_size(ne16_dev_t *dev) { return 2; }

inline int ne16_task_queue_tasks_in_flight(ne16_dev_t *dev) {
uint32_t status = hwpe_task_queue_status(&dev->hwpe_dev);
return (status & 0x1) + ((status >> 8) & 0x1);
3 changes: 2 additions & 1 deletion ne16/hal/ne16.h
@@ -24,11 +24,12 @@
#include "hwpe.h"
#include <stdint.h>

#define NE16_TASK_QUEUE_SIZE (2)

typedef struct ne16_dev_t {
hwpe_dev_t hwpe_dev; /* Implements the HWPE device interface */
} ne16_dev_t;

int ne16_task_queue_size(ne16_dev_t *dev);
int ne16_task_queue_tasks_in_flight(ne16_dev_t *dev);
int ne16_task_queue_empty(ne16_dev_t *dev);
int ne16_task_queue_full(ne16_dev_t *dev);
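
The queue bookkeeping above pairs with the status-word arithmetic in `ne16.c`: one job slot is reported in bit 0 and the other in bit 8, so `NE16_STATUS_EMPTY` (0x000) maps to 0 tasks in flight and `NE16_STATUS_FULL` (0x101) to `NE16_TASK_QUEUE_SIZE` tasks. A small host-side sketch that reproduces that arithmetic (constants copied from the diff; no hardware needed):

```c
#include <assert.h>
#include <stdint.h>

#define NE16_STATUS_EMPTY (0x000)
#define NE16_STATUS_FULL (0x101)
#define NE16_TASK_QUEUE_SIZE (2)

// Same bit arithmetic as ne16_task_queue_tasks_in_flight(): one slot is
// reported in bit 0, the other in bit 8.
static int tasks_in_flight_from_status(uint32_t status) {
  return (status & 0x1) + ((status >> 8) & 0x1);
}

int main(void) {
  assert(tasks_in_flight_from_status(NE16_STATUS_EMPTY) == 0); // queue empty
  assert(tasks_in_flight_from_status(0x001) == 1);             // one slot busy
  assert(tasks_in_flight_from_status(NE16_STATUS_FULL) == NE16_TASK_QUEUE_SIZE);
  return 0;
}
```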