Skip to content

Commit

Permalink
Merge pull request #973 from calad0i/latency_pool
Browse files Browse the repository at this point in the history
Latency Pooling Header Updates
  • Loading branch information
jmitrevs authored Jun 28, 2024
2 parents 0236eff + 4940739 commit 9e9944f
Show file tree
Hide file tree
Showing 6 changed files with 109 additions and 191 deletions.
26 changes: 0 additions & 26 deletions hls4ml/backends/vivado/vivado_backend.py
Original file line number Diff line number Diff line change
Expand Up @@ -18,8 +18,6 @@
Embedding,
GarNet,
GarNetStack,
GlobalPooling1D,
GlobalPooling2D,
Layer,
Pooling1D,
Pooling2D,
Expand All @@ -31,7 +29,6 @@
from hls4ml.model.optimizer import get_backend_passes, layer_optimizer
from hls4ml.model.types import FixedPrecisionType, IntegerPrecisionType, NamedType, PackedType
from hls4ml.report import parse_vivado_report
from hls4ml.utils.fixed_point_utils import ceil_log2


class VivadoBackend(FPGABackend):
Expand Down Expand Up @@ -413,37 +410,14 @@ def init_depconv2d(self, layer):

layer.set_attr('implementation', layer.model.config.get_conv_implementation(layer).lower())

def _set_pooling_accum_t(self, layer, pool_size):
    # Widen the layer's accumulator type so that summing `pool_size` values
    # cannot overflow: ceil_log2(pool_size) extra bits are needed for the sum.
    # NOTE(review): the width is grown by twice that amount — presumably to
    # leave fractional headroom for the subsequent average; confirm intent.
    extra_bits = ceil_log2(pool_size)
    # NOTE(review): this mutates the precision object held by the layer's
    # 'accum_t' attribute in place — verify it is not shared with other layers.
    accum_t = layer.get_attr('accum_t')
    accum_t.precision.width += extra_bits * 2
    if isinstance(accum_t.precision, FixedPrecisionType):
        # Fixed-point types additionally need the integer part widened to
        # hold the larger sum; pure-integer precisions only carry a width.
        accum_t.precision.integer += extra_bits

@layer_optimizer(Pooling1D)
def init_pooling1d(self, layer):
    """Optimizer for 1D pooling: widen the accumulator for the pool window
    and record the configured conv implementation on the layer."""
    self._set_pooling_accum_t(layer, layer.get_attr('pool_width'))

    impl = layer.model.config.get_conv_implementation(layer).lower()
    layer.set_attr('implementation', impl)

@layer_optimizer(Pooling2D)
def init_pooling2d(self, layer):
    """Optimizer for 2D pooling: widen the accumulator for the full pool
    window area and record the configured conv implementation."""
    window_area = layer.get_attr('pool_height') * layer.get_attr('pool_width')
    self._set_pooling_accum_t(layer, window_area)

    impl = layer.model.config.get_conv_implementation(layer).lower()
    layer.set_attr('implementation', impl)

@layer_optimizer(GlobalPooling1D)
def init_global_pooling1d(self, layer):
    """Optimizer for global 1D pooling: the whole input length is one
    pool window, so size the accumulator for `n_in` elements."""
    self._set_pooling_accum_t(layer, layer.get_attr('n_in'))

@layer_optimizer(GlobalPooling2D)
def init_global_pooling2d(self, layer):
    """Optimizer for global 2D pooling: the pool window is the entire
    input image, so size the accumulator for height * width elements."""
    image_area = layer.get_attr('in_height') * layer.get_attr('in_width')
    self._set_pooling_accum_t(layer, image_area)

@layer_optimizer(Softmax)
def init_softmax(self, layer):
if layer.model.config.get_config_value('IOType') == 'io_parallel':
Expand Down
136 changes: 55 additions & 81 deletions hls4ml/templates/vitis/nnet_utils/nnet_pooling.h
Original file line number Diff line number Diff line change
Expand Up @@ -8,60 +8,40 @@
namespace nnet {

// Return the maximum value from an array
// Return the maximum value from an array.
// The result is returned as accum_t (converted from T on return) so callers
// can use a common accumulator type for all pooling reductions; taking the
// max itself cannot overflow, so the running value is kept in T.
// NOTE: the diff scrape had left both the old and new signatures in place,
// which is ill-formed; this is the accum_t-returning version.
template <typename T, int N, typename accum_t> accum_t max(T x[N]) {
    T y = x[0];
    for (int i = 1; i < N; i++) {
        y = x[i] > y ? x[i] : y;
    }
    return y;
}

// Return the average of an ap_int array.
// @param x  reference to an array of N W-bit signed integers
// @return   the truncated mean, narrowed back to W bits
template <int W, int N> ap_int<W> avg(ap_int<W> (&x)[N]) {
    // Use a wider accumulator than the input to avoid overflow:
    // summing N values needs ceillog2(N) extra bits.
    ap_int<W + ceillog2(N)> tmp = 0;
    for (int i = 0; i < N; i++) {
        tmp += x[i];
    }
    tmp /= N;
    // Cast back to the original type before returning. (The original code
    // declared `y` but returned `tmp`; the implicit conversion made the
    // behavior identical, but the dead store was confusing and inconsistent
    // with the ap_fixed overload.)
    ap_int<W> y = tmp;
    return y;
}

// Return the average of an ap_fixed array, narrowed back to the input type.
// @param x  reference to an array of N ap_fixed<W, I> values
// @return   the mean, cast back to ap_fixed<W, I>
template <int W, int I, int N> ap_fixed<W, I> avg(ap_fixed<W, I> (&x)[N]) {
    // Summing N values needs ceillog2(N) extra bits in both the total
    // width and the integer part to guarantee the sum cannot overflow.
    ap_fixed<W + ceillog2(N), I + ceillog2(N)> acc = 0;
    for (int idx = 0; idx < N; idx++) {
        acc += x[idx];
    }
    acc /= N;
    // Narrow the widened accumulator back to the caller's precision.
    ap_fixed<W, I> result = acc;
    return result;
}

// Return the mean value of an array
template <typename T, int N> T avg(T (&x)[N]) {
T y = 0;
template <typename T, int N, typename accum_t> accum_t avg(T (&x)[N], unsigned length) {
accum_t y = 0;
for (int i = 0; i < N; i++) {
y += x[i];
}
y /= N;
y /= length;
return y;
}

// Enumeration for pooling operation (max, avg, l2norm pooling)
enum Pool_Op { Max, Average }; // L2Norm };
template <typename T, int N, Pool_Op op> T pool_op(T (&x)[N]) {
template <typename T, int N, Pool_Op op, typename accum_t> accum_t pool_op(T (&x)[N], unsigned length) {
switch (op) {
case Max:
return max<T, N>(x);
return max<T, N, accum_t>(x);
case Average:
return avg(x);
return avg<T, N, accum_t>(x, length);
// case L2Norm: return l2norm<T, N>(x);
}
}

// Convenience overload: pool over the entire window, i.e. all N entries are
// treated as valid (no padding pixels excluded from an average).
template <typename T, int N, Pool_Op op, typename accum_t> accum_t pool_op(T (&x)[N]) {
    return pool_op<T, N, op, accum_t>(x, N);
}

template <typename T, Pool_Op op> T pad_val() {
/*---
*- In Tensorflow, pooling ignores the value in the padded cells
Expand Down Expand Up @@ -104,8 +84,10 @@ void pooling1d_cl(data_T data[CONFIG_T::n_in * CONFIG_T::n_filt], res_T res[CONF

// TODO partition the arrays according to the reuse factor
const int limit = pool_op_limit_1d<CONFIG_T>();
#pragma HLS ALLOCATION function instances=pool_op<data_T, CONFIG_T::pool_width, CONFIG_T::pool_op> limit=limit
#pragma HLS ALLOCATION function instances=pool_op<data_T, CONFIG_T::pool_width, \
CONFIG_T::pool_op, typename CONFIG_T::accum_t> limit=limit
// Add any necessary padding

unsigned padded_width = CONFIG_T::n_in + CONFIG_T::pad_left + CONFIG_T::pad_right;
if (CONFIG_T::pad_left == 0 && CONFIG_T::pad_right == 0) {
padded_width -= padded_width - (padded_width / CONFIG_T::stride_width * CONFIG_T::stride_width);
Expand All @@ -114,29 +96,22 @@ void pooling1d_cl(data_T data[CONFIG_T::n_in * CONFIG_T::n_filt], res_T res[CONF
for (int ff = 0; ff < CONFIG_T::n_filt; ff++) {
// Loop over input image x in steps of stride
for (int ii = 0; ii < padded_width; ii += CONFIG_T::stride_width) {
unsigned overlap_pixel = 0;
data_T pool[CONFIG_T::pool_width];
// Keep track of number of pixels in image vs padding region
unsigned img_overlap = 0;
// Loop over pool window x
for (int jj = 0; jj < CONFIG_T::stride_width; jj++) {
if (ii + jj < CONFIG_T::pad_left || ii + jj >= (padded_width - CONFIG_T::pad_right)) {
// Add padding
pool[jj] = pad_val<data_T, CONFIG_T::pool_op>();
} else {
#pragma HLS ARRAY_PARTITION variable=pool complete dim=0

for (int jj = 0; jj < CONFIG_T::pool_width; jj++) {
if (ii + jj >= CONFIG_T::pad_left && ii + jj < CONFIG_T::n_in + CONFIG_T::pad_left) {
pool[jj] = data[(ii + jj - CONFIG_T::pad_left) * CONFIG_T::n_filt + ff];
img_overlap++;
}
overlap_pixel++;
} else
pool[jj] = pad_val<data_T, CONFIG_T::pool_op>();
}
// do the pooling
// TODO in the case of average pooling, need to reduce width to area of pool window
// not overlapping padding region

int patch_size = CONFIG_T::count_pad ? CONFIG_T::stride_width : overlap_pixel;

res[(ii / CONFIG_T::stride_width) * CONFIG_T::n_filt + ff] =
pool_op<data_T, CONFIG_T::pool_width, CONFIG_T::pool_op>(pool);
// If the pool op is Average, the zero-padding needs to be removed from the results
if (CONFIG_T::pool_op == Average) {
data_T rescale = static_cast<data_T>(CONFIG_T::pool_width) / img_overlap;
res[(ii / CONFIG_T::stride_width) * CONFIG_T::n_filt + ff] *= rescale;
}
pool_op<data_T, CONFIG_T::pool_width, CONFIG_T::pool_op, typename CONFIG_T::accum_t>(pool, patch_size);
}
}
}
Expand All @@ -150,15 +125,16 @@ void global_pooling1d_cl(data_T data[CONFIG_T::n_in * CONFIG_T::n_filt], res_T r

// TODO partition the arrays according to the reuse factor
const int limit = pool_op_limit_1d<CONFIG_T>();
#pragma HLS ALLOCATION function instances=pool_op<data_T, CONFIG_T::pool_width, CONFIG_T::pool_op> limit=limit
#pragma HLS ALLOCATION function instances=pool_op<data_T, CONFIG_T::pool_width, \
CONFIG_T::pool_op, typename CONFIG_T::accum_t> limit=limit

for (int ff = 0; ff < CONFIG_T::n_filt; ff++) {
data_T pool[CONFIG_T::n_in];
for (int jj = 0; jj < CONFIG_T::n_in; jj++) {
pool[jj] = data[jj * CONFIG_T::n_filt + ff];
}
// do the pooling
res[ff] = pool_op<data_T, CONFIG_T::n_in, CONFIG_T::pool_op>(pool);
res[ff] = pool_op<data_T, CONFIG_T::n_in, CONFIG_T::pool_op, typename CONFIG_T::accum_t>(pool);
}
}

Expand Down Expand Up @@ -199,8 +175,7 @@ void pooling2d_cl(data_T data[CONFIG_T::in_height * CONFIG_T::in_width * CONFIG_
// TODO partition the arrays according to the reuse factor
const int limit = pool_op_limit<CONFIG_T>();
#pragma HLS ALLOCATION function instances=pool_op<data_T, CONFIG_T::pool_height*CONFIG_T::pool_width, \
CONFIG_T::pool_op> limit=limit
// Add any necessary padding
CONFIG_T::pool_op, typename CONFIG_T::accum_t> limit=limit
unsigned padded_height = CONFIG_T::in_height + CONFIG_T::pad_top + CONFIG_T::pad_bottom;
unsigned padded_width = CONFIG_T::in_width + CONFIG_T::pad_left + CONFIG_T::pad_right;
if (CONFIG_T::pad_top == 0 && CONFIG_T::pad_bottom == 0 && CONFIG_T::pad_left == 0 && CONFIG_T::pad_right == 0) {
Expand All @@ -214,37 +189,34 @@ void pooling2d_cl(data_T data[CONFIG_T::in_height * CONFIG_T::in_width * CONFIG_
// Loop over input image x in steps of stride
for (int jj = 0; jj < padded_width; jj += CONFIG_T::stride_width) {
data_T pool[CONFIG_T::pool_height * CONFIG_T::pool_width];
// Keep track of number of pixels in image vs padding region
unsigned img_overlap = 0;
#pragma HLS ARRAY_PARTITION variable=pool complete dim=0

unsigned overlap_pixel = 0;

// Loop over pool window y
for (int kk = 0; kk < CONFIG_T::stride_height; kk++) {
// Loop over pool window x
for (int ll = 0; ll < CONFIG_T::stride_width; ll++) {
if (ii + kk < CONFIG_T::pad_top || ii + kk >= (padded_height - CONFIG_T::pad_bottom) ||
jj + ll < CONFIG_T::pad_left || jj + ll >= (padded_width - CONFIG_T::pad_right)) {
// Add padding
bool cond1 = ii + kk >= CONFIG_T::pad_top && ii + kk < CONFIG_T::in_height + CONFIG_T::pad_top;
bool cond2 = jj + ll >= CONFIG_T::pad_left && jj + ll < CONFIG_T::in_width + CONFIG_T::pad_left;
if (cond1 && cond2) {
unsigned data_idx =
((ii + kk - CONFIG_T::pad_top) * CONFIG_T::in_width + (jj + ll - CONFIG_T::pad_left)) *
CONFIG_T::n_filt +
ff;
pool[kk * CONFIG_T::stride_width + ll] = data[data_idx];
overlap_pixel++;
} else
pool[kk * CONFIG_T::stride_width + ll] = pad_val<data_T, CONFIG_T::pool_op>();
} else {
pool[kk * CONFIG_T::stride_width + ll] =
data[(ii + kk - CONFIG_T::pad_top) * CONFIG_T::in_width * CONFIG_T::n_filt +
(jj + ll - CONFIG_T::pad_left) * CONFIG_T::n_filt + ff];
img_overlap++;
}
}
}
// do the pooling
// TODO in the case of average pooling, need to reduce height * width to area of pool window
// not overlapping padding region

int patch_size = CONFIG_T::count_pad ? CONFIG_T::stride_width * CONFIG_T::stride_height : overlap_pixel;

res[(ii / CONFIG_T::stride_height) * CONFIG_T::out_width * CONFIG_T::n_filt +
(jj / CONFIG_T::stride_width) * CONFIG_T::n_filt + ff] =
pool_op<data_T, CONFIG_T::pool_height * CONFIG_T::pool_width, CONFIG_T::pool_op>(pool);
// If the pool op is Average, the zero-padding needs to be removed from the results
if (CONFIG_T::pool_op == Average) {
data_T rescale =
static_cast<data_T>(CONFIG_T::pool_height) * static_cast<data_T>(CONFIG_T::pool_width) / img_overlap;
res[(ii / CONFIG_T::stride_height) * CONFIG_T::out_width * CONFIG_T::n_filt +
(jj / CONFIG_T::stride_width) * CONFIG_T::n_filt + ff] *= rescale;
}
pool_op<data_T, CONFIG_T::pool_height * CONFIG_T::pool_width, CONFIG_T::pool_op,
typename CONFIG_T::accum_t>(pool, patch_size);
}
}
}
Expand All @@ -258,7 +230,7 @@ void pooling2d_cf(data_T data[CONFIG_T::in_height * CONFIG_T::in_width * CONFIG_
// TODO partition the arrays according to the reuse factor
const int limit = pool_op_limit<CONFIG_T>();
#pragma HLS ALLOCATION function instances=pool_op<data_T, CONFIG_T::pool_height*CONFIG_T::pool_width, \
CONFIG_T::pool_op> limit=limit
CONFIG_T::pool_op, typename CONFIG_T::accum_t> limit=limit
// Add any necessary padding
unsigned padded_height = CONFIG_T::in_height + CONFIG_T::pad_top + CONFIG_T::pad_bottom;
unsigned padded_width = CONFIG_T::in_width + CONFIG_T::pad_left + CONFIG_T::pad_right;
Expand Down Expand Up @@ -296,7 +268,8 @@ void pooling2d_cf(data_T data[CONFIG_T::in_height * CONFIG_T::in_width * CONFIG_
// not overlapping padding region
res[(ii / CONFIG_T::stride_height) * CONFIG_T::out_width + (jj / CONFIG_T::stride_width) +
ff * CONFIG_T::out_height * CONFIG_T::out_width] =
pool_op<data_T, CONFIG_T::pool_height * CONFIG_T::pool_width, CONFIG_T::pool_op>(pool);
pool_op<data_T, CONFIG_T::pool_height * CONFIG_T::pool_width, CONFIG_T::pool_op,
typename CONFIG_T::accum_t>(pool);
// If the pool op is Average, the zero-padding needs to be removed from the results
if (CONFIG_T::pool_op == Average) {
data_T rescale =
Expand All @@ -321,7 +294,7 @@ void global_pooling2d_cl(data_T data[CONFIG_T::in_height * CONFIG_T::in_width *

const int limit = pool_op_limit<CONFIG_T>();
#pragma HLS ALLOCATION function instances=pool_op<data_T, CONFIG_T::pool_width * CONFIG_T::pool_height, \
CONFIG_T::pool_op> limit=limit
CONFIG_T::pool_op, typename CONFIG_T::accum_t> limit=limit

FiltLoop:
for (int filt = 0; filt < CONFIG_T::n_filt; filt++) {
Expand All @@ -332,7 +305,8 @@ void global_pooling2d_cl(data_T data[CONFIG_T::in_height * CONFIG_T::in_width *
pool[i] = data[i * CONFIG_T::n_filt + filt];
}

res[filt] = static_cast<res_T>(pool_op<data_T, CONFIG_T::in_height * CONFIG_T::in_width, CONFIG_T::pool_op>(pool));
res[filt] = static_cast<res_T>(
pool_op<data_T, CONFIG_T::in_height * CONFIG_T::in_width, CONFIG_T::pool_op, typename CONFIG_T::accum_t>(pool));
}
}

Expand Down
Loading

0 comments on commit 9e9944f

Please sign in to comment.