Skip to content

Commit

Permalink
Merge branch 'main' of github.com:pulp-platform/pulp-nnx into fconti/…
Browse files Browse the repository at this point in the history
…neureka
FrancescoConti committed Feb 22, 2024
2 parents bbd4b7a + b4d7cd4 commit 99cc182
Showing 15 changed files with 216 additions and 102 deletions.
2 changes: 2 additions & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
@@ -8,12 +8,14 @@
- Support for kernels without normalization and quantization for NE16
- isort check
- publication citation
- Support for 32-bit scale

### Changed

- `ne16_task_init` got split into smaller parts: `ne16_task_init`, `ne16_task_set_op_to_conv`, `ne16_task_set_weight_offset`, `ne16_task_set_bits`, `ne16_task_set_norm_quant`
- strides in `ne16_task_set_strides`, `ne16_task_set_dims`, and `ne16_task_set_ptrs` are now strides between consecutive elements in that dimension
- `ne16_task_queue_size` is now `NE16_TASK_QUEUE_SIZE`
- `ne16_task_set_ptrs` split into `ne16_task_set_ptrs_conv` and `ne16_task_set_ptrs_norm_quant`

### Removed

2 changes: 1 addition & 1 deletion ne16/README.md
Original file line number Diff line number Diff line change
@@ -28,7 +28,7 @@
- [ ] Scale type
- [x] uint8
- [ ] uint16
- [ ] uint32
- [x] uint32
- [x] Bias type
- [x] int32
- [ ] Weight type
25 changes: 14 additions & 11 deletions ne16/hal/ne16_task.c
Original file line number Diff line number Diff line change
@@ -113,15 +113,18 @@ uint32_t ne16_pad_ptr(uint32_t ptr, const uint32_t width, uint32_t width_stride,
return ptr - (padding_top * width + padding_left) * width_stride;
}

void ne16_task_set_ptrs(ne16_task_t *task, uint32_t input_ptr, uint32_t w_in,
uint32_t w_in_stride, uint8_t padding_top,
uint8_t padding_left, uint32_t output_ptr,
uint32_t weights_ptr, uint32_t scale_ptr,
uint32_t shift_ptr, uint32_t bias_ptr) {
void ne16_task_set_ptrs_conv(ne16_task_t *task, uint32_t input_ptr,
uint32_t w_in, uint32_t w_in_stride,
uint8_t padding_top, uint8_t padding_left,
uint32_t output_ptr, uint32_t weights_ptr) {
task->data.infeat_ptr =
ne16_pad_ptr(input_ptr, w_in, w_in_stride, padding_top, padding_left);
task->data.outfeat_ptr = output_ptr;
task->data.weights_ptr = weights_ptr;
}

void ne16_task_set_ptrs_norm_quant(ne16_task_t *task, uint32_t scale_ptr,
uint32_t shift_ptr, uint32_t bias_ptr) {
task->data.scale_ptr = scale_ptr;
task->data.scale_shift_ptr = shift_ptr;
task->data.scale_bias_ptr = bias_ptr;
@@ -206,8 +209,8 @@ void ne16_task_set_padding(ne16_task_t *task, const uint8_t top,
}

void ne16_task_set_mask_filter(ne16_task_t *task, const uint8_t top,
const uint8_t right, const uint8_t bottom,
const uint8_t left) {
const uint8_t bottom, const uint8_t left,
const uint8_t right) {
task->data.cfg.filter_mask = ((top & 0xff) << 24) | ((right & 0xff) << 16) |
((bottom & 0xff) << 8) | ((left & 0xff) << 0);
}
@@ -219,8 +222,8 @@ void ne16_task_set_dims(ne16_task_t *task, const uint32_t w_in,
const uint32_t h_out_stride,
const uint32_t w_out_stride, const uint8_t padding_top,
const uint8_t padding_bottom,
const uint8_t padding_right,
const uint8_t padding_left) {
const uint8_t padding_left,
const uint8_t padding_right) {
ne16_task_set_strides(task, k_in, h_in_stride, w_in_stride, h_out_stride,
w_out_stride);
ne16_task_set_counters(task, k_in, h_out, w_out, k_out, padding_bottom,
@@ -235,8 +238,8 @@ void ne16_task_set_dims_stride2x2(
const uint32_t h_out, const uint32_t w_out, const uint32_t k_out,
const uint32_t h_out_stride, const uint32_t w_out_stride,
const uint8_t h_ker, const uint8_t w_ker, const uint8_t padding_top,
const uint8_t padding_bottom, const uint8_t padding_right,
const uint8_t padding_left) {
const uint8_t padding_bottom, const uint8_t padding_left,
const uint8_t padding_right) {
const uint8_t stride = 2;

// WARNING: works only for even output channel stride (divisible by 2)
31 changes: 16 additions & 15 deletions ne16/hal/ne16_task.h
Original file line number Diff line number Diff line change
@@ -42,8 +42,8 @@ typedef enum {

typedef struct ne16_norm_t {
ne16_norm_mode_e mode;
int flag_bias;
int flag_shift;
ne16_task_flag_e flag_bias;
ne16_task_flag_e flag_shift;
} ne16_norm_t;

typedef enum ne16_quant_mode_e {
@@ -59,9 +59,9 @@ typedef enum ne16_quant_function_e {

typedef struct ne16_quant_t {
// Shift amount must be in range 0x00-0x1F
unsigned shift_amount;
uint8_t shift_amount;
ne16_quant_function_e function;
int flag_rounding;
ne16_task_flag_e flag_rounding;
} ne16_quant_t;

typedef struct ne16_stride_t {
@@ -133,11 +133,12 @@ uint32_t ne16_get_tile_padding(uint32_t padding, uint32_t i_height,
uint32_t ne16_pad_ptr(uint32_t ptr, const uint32_t width,
const uint32_t width_stride, const uint8_t padding_top,
const uint8_t padding_left);
void ne16_task_set_ptrs(ne16_task_t *task, uint32_t input_ptr, uint32_t w_in,
uint32_t w_in_stride, uint8_t padding_top,
uint8_t padding_left, uint32_t output_ptr,
uint32_t weights_ptr, uint32_t scale_ptr,
uint32_t shift_ptr, uint32_t bias_ptr);
void ne16_task_set_ptrs_conv(ne16_task_t *task, uint32_t input_ptr,
uint32_t w_in, uint32_t w_in_stride,
uint8_t padding_top, uint8_t padding_left,
uint32_t output_ptr, uint32_t weights_ptr);
void ne16_task_set_ptrs_norm_quant(ne16_task_t *task, uint32_t scale_ptr,
uint32_t shift_ptr, uint32_t bias_ptr);
/** ne16_task_set_strides
*
* All the strides variables are strides between elements alongside that
@@ -157,8 +158,8 @@ void ne16_task_set_padding(ne16_task_t *task, const uint8_t top,
const uint8_t bottom, const uint8_t left,
const uint8_t right, const uint8_t value);
void ne16_task_set_mask_filter(ne16_task_t *task, const uint8_t top,
const uint8_t right, const uint8_t bottom,
const uint8_t left);
const uint8_t bottom, const uint8_t left,
const uint8_t right);
/** ne16_task_set_dims
*
* All the strides variables are strides between elements alongside that
@@ -172,8 +173,8 @@ void ne16_task_set_dims(ne16_task_t *task, const uint32_t w_in,
const uint32_t h_out_stride,
const uint32_t w_out_stride, const uint8_t padding_top,
const uint8_t padding_bottom,
const uint8_t padding_right,
const uint8_t padding_left);
const uint8_t padding_left,
const uint8_t padding_right);
/** ne16_task_set_dims_stride2x2
*
* All the strides variables are strides between elements alongside that
@@ -186,7 +187,7 @@ void ne16_task_set_dims_stride2x2(
const uint32_t h_out, const uint32_t w_out, const uint32_t k_out,
const uint32_t h_out_stride, const uint32_t w_out_stride,
const uint8_t h_ker, const uint8_t w_ker, const uint8_t padding_top,
const uint8_t padding_bottom, const uint8_t padding_right,
const uint8_t padding_left);
const uint8_t padding_bottom, const uint8_t padding_left,
const uint8_t padding_right);

#endif // !__NE16_TASK_H__
5 changes: 2 additions & 3 deletions neureka/README.md
Original file line number Diff line number Diff line change
@@ -16,17 +16,16 @@ Github repo [link](https://github.com/siracusa-soc/ne).
- [x] Bias (w/ and w/o)
- [ ] Per-channel shift
- [x] Per-layer shift
- [ ] Rounding
- [x] Input type
- [x] uint8
- [x] int8
- [x] Output type
- [x] int8
- [x] uint8 (only w/ Relu)
- [x] int32
- [ ] Scale type
- [x] Scale type
- [x] uint8
- [ ] uint32
- [x] uint32
- [x] Bias type
- [x] int32
- [ ] Weight type
6 changes: 3 additions & 3 deletions neureka/bsp/siracusa/neureka_siracusa_bsp.h
Original file line number Diff line number Diff line change
@@ -18,8 +18,8 @@
* SPDX-License-Identifier: Apache-2.0
*/

#ifndef __NEUREKA_siracusa_BSP_H__
#define __NEUREKA_siracusa_BSP_H__
#ifndef __NEUREKA_SIRACUSA_BSP_H__
#define __NEUREKA_SIRACUSA_BSP_H__

#include "neureka.h"
#include <stdint.h>
@@ -64,4 +64,4 @@ void neureka_siracusa_close();
void neureka_siracusa_event_wait_and_clear();
const neureka_dev_t *neureka_siracusa_get_dev();

#endif // !__NEUREKA_siracusa_BSP_H__
#endif // !__NEUREKA_SIRACUSA_BSP_H__
23 changes: 12 additions & 11 deletions neureka/hal/neureka_task.c
Original file line number Diff line number Diff line change
@@ -47,8 +47,7 @@ void neureka_task_init(neureka_task_t *task) {

void neureka_task_set_op_to_conv(neureka_task_t *task,
const uint8_t kernel_shape,
const uint8_t depthwise,
const uint8_t stride) {
const uint8_t depthwise) {
task->depthwise = depthwise;
task->kernel_shape = kernel_shape;
task->subtile_output_channel = depthwise ? NEUREKA_SUBTILE_INPUT_CHANNEL_3x3
@@ -133,16 +132,18 @@ uint32_t neureka_pad_ptr(uint32_t ptr, const uint32_t width,
return ptr - (padding_top * width + padding_left) * width_stride;
}

void neureka_task_set_ptrs(neureka_task_t *task, uint32_t input_ptr,
uint32_t w_in, uint32_t w_in_stride,
uint8_t padding_top, uint8_t padding_left,
uint32_t output_ptr, uint32_t weights_ptr,
uint32_t scale_ptr, uint32_t shift_ptr,
uint32_t bias_ptr) {
void neureka_task_set_ptrs_conv(neureka_task_t *task, uint32_t input_ptr,
uint32_t w_in, uint32_t w_in_stride,
uint8_t padding_top, uint8_t padding_left,
uint32_t output_ptr, uint32_t weights_ptr) {
task->data.infeat_ptr =
neureka_pad_ptr(input_ptr, w_in, w_in_stride, padding_top, padding_left);
task->data.outfeat_ptr = output_ptr;
task->data.weights_ptr = weights_ptr;
}

void neureka_task_set_ptrs_norm_quant(neureka_task_t *task, uint32_t scale_ptr,
uint32_t shift_ptr, uint32_t bias_ptr) {
task->data.scale_ptr = scale_ptr;
task->data.scale_shift_ptr = shift_ptr;
task->data.scale_bias_ptr = bias_ptr;
@@ -223,8 +224,8 @@ void neureka_task_set_padding(neureka_task_t *task, const uint8_t top,
}

void neureka_task_set_mask_filter(neureka_task_t *task, const uint8_t top,
const uint8_t right, const uint8_t bottom,
const uint8_t left) {
const uint8_t bottom, const uint8_t left,
const uint8_t right) {
task->data.cfg.filter_mask = ((top & 0xff) << 24) | ((right & 0xff) << 16) |
((bottom & 0xff) << 8) | ((left & 0xff) << 0);
}
@@ -235,7 +236,7 @@ void neureka_task_set_dims(
const uint32_t h_out, const uint32_t w_out, const uint32_t k_out,
const uint32_t h_out_stride, const uint32_t w_out_stride,
const uint8_t padding_top, const uint8_t padding_bottom,
const uint8_t padding_right, const uint8_t padding_left) {
const uint8_t padding_left, const uint8_t padding_right) {
neureka_task_set_strides(task, k_in, h_in_stride, w_in_stride, h_out_stride,
w_out_stride);
neureka_task_set_counters(task, k_in, h_out, w_out, k_out, padding_bottom,
28 changes: 14 additions & 14 deletions neureka/hal/neureka_task.h
Original file line number Diff line number Diff line change
@@ -51,8 +51,8 @@ typedef enum {

typedef struct neureka_norm_t {
neureka_norm_mode_e mode;
int flag_bias;
int flag_shift;
neureka_task_flag_e flag_bias;
neureka_task_flag_e flag_shift;
} neureka_norm_t;

typedef enum neureka_quant_mode_e {
@@ -67,9 +67,9 @@ typedef enum neureka_quant_function_e {

typedef struct neureka_quant_t {
// Shift amount must be in range 0x00-0x1F
unsigned shift_amount;
uint8_t shift_amount;
neureka_quant_function_e function;
int flag_rounding;
neureka_task_flag_e flag_rounding;
} neureka_quant_t;

typedef struct neureka_stride_t {
@@ -128,7 +128,7 @@ typedef struct neureka_task_t {
void neureka_task_init(neureka_task_t *task);
void neureka_task_set_op_to_conv(neureka_task_t *task,
const uint8_t kernel_shape,
const uint8_t depthwise, const uint8_t stride);
const uint8_t depthwise);
void neureka_task_set_bits(neureka_task_t *task, const uint8_t input_bits,
const uint8_t output_bits,
const uint8_t weight_bits);
@@ -147,12 +147,12 @@ uint32_t neureka_get_tile_padding(uint32_t padding, uint32_t i_height,
uint32_t neureka_pad_ptr(uint32_t ptr, const uint32_t width,
const uint32_t width_stride, const uint8_t padding_top,
const uint8_t padding_left);
void neureka_task_set_ptrs(neureka_task_t *task, uint32_t input_ptr,
uint32_t w_in, uint32_t w_in_stride,
uint8_t padding_top, uint8_t padding_left,
uint32_t output_ptr, uint32_t weights_ptr,
uint32_t scale_ptr, uint32_t shift_ptr,
uint32_t bias_ptr);
void neureka_task_set_ptrs_conv(neureka_task_t *task, uint32_t input_ptr,
uint32_t w_in, uint32_t w_in_stride,
uint8_t padding_top, uint8_t padding_left,
uint32_t output_ptr, uint32_t weights_ptr);
void neureka_task_set_ptrs_norm_quant(neureka_task_t *task, uint32_t scale_ptr,
uint32_t shift_ptr, uint32_t bias_ptr);
/** neureka_task_set_strides
*
* All the strides variables are strides between elements alongside that
@@ -173,8 +173,8 @@ void neureka_task_set_padding(neureka_task_t *task, const uint8_t top,
const uint8_t bottom, const uint8_t left,
const uint8_t right, const uint8_t value);
void neureka_task_set_mask_filter(neureka_task_t *task, const uint8_t top,
const uint8_t right, const uint8_t bottom,
const uint8_t left);
const uint8_t bottom, const uint8_t left,
const uint8_t right);
/** neureka_task_set_dims
*
* All the strides variables are strides between elements alongside that
@@ -187,6 +187,6 @@ void neureka_task_set_dims(
const uint32_t h_out, const uint32_t w_out, const uint32_t k_out,
const uint32_t h_out_stride, const uint32_t w_out_stride,
const uint8_t padding_top, const uint8_t padding_bottom,
const uint8_t padding_right, const uint8_t padding_left);
const uint8_t padding_left, const uint8_t padding_right);

#endif // !__NEUREKA_TASK_H__
17 changes: 14 additions & 3 deletions test/NeuralEngineFunctionalModel.py
Original file line number Diff line number Diff line change
@@ -28,24 +28,34 @@ def _norm_quant(
bias_type: Optional[IntegerType],
has_bias: bool,
has_relu: bool,
verbose: bool,
) -> torch.Tensor:
# Scale accumulators are in 48bit, so keeping the data in 64bit
tensor = tensor * scale
assert tensor.dtype == torch.int64

if verbose:
print("INTERMEDIATE RESULTS (after scale):")
print(tensor)

if has_bias:
assert bias is not None
assert bias_type is not None
# Saturating cast to int32

tensor = NeuralEngineFunctionalModel._cast(
tensor, bias_type, saturate=True
tensor, bias_type, saturate=False
).type(torch.int32)

tensor = tensor + bias

tensor = NeuralEngineFunctionalModel._cast(
tensor, bias_type, saturate=False
tensor, bias_type, saturate=True
).type(torch.int32)

if verbose:
print("INTERMEDIATE RESULTS (after bias):")
print(tensor)

if has_relu:
tensor = F.relu(tensor)

@@ -118,6 +128,7 @@ def convolution(
bias_type,
has_bias,
has_relu,
verbose,
)

return output
2 changes: 0 additions & 2 deletions test/NeurekaMemoryLayout.py
Original file line number Diff line number Diff line change
@@ -20,8 +20,6 @@
import numpy as np
import numpy.typing as npt

from TestClasses import IntegerType


class NeurekaMemoryLayout:
_WEIGHT_BANDWIDTH = 256
12 changes: 9 additions & 3 deletions test/NnxTestClasses.py
Original file line number Diff line number Diff line change
@@ -254,16 +254,22 @@ def from_conf(
).type(torch.int32)
if global_shift is None:
global_shift = torch.Tensor([0]).type(torch.int32)
conv_kwargs = {
**conf.__dict__,
"out_type": NeuralEngineFunctionalModel.ACCUMULATOR_TYPE,
}
output = NeuralEngineFunctionalModel().convolution(
input,
weight,
scale,
bias,
global_shift,
verbose=verbose,
**conf.__dict__,
verbose=False,
**conv_kwargs,
)
global_shift = NnxTestGenerator._calculate_global_shift(
output, conf.out_type
)
NnxTestGenerator._calculate_global_shift(output, conf.out_type)

output = NeuralEngineFunctionalModel().convolution(
input, weight, scale, bias, global_shift, verbose=verbose, **conf.__dict__
Loading

0 comments on commit 99cc182

Please sign in to comment.