Skip to content

Commit

Permalink
Silly optimizations to remove a ton of inner branches.
Browse files Browse the repository at this point in the history
  • Loading branch information
Mysticial committed Dec 11, 2024
1 parent ca6220a commit 763cb1c
Showing 1 changed file with 144 additions and 72 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -39,6 +39,7 @@ PA_FORCE_INLINE void filter_per_pixel(

const size_t VECTOR_SIZE = Runner::VECTOR_SIZE;

// Less than vector width. No need for steady-state loop.
if (width < VECTOR_SIZE){
typename Runner::Mask mask(width);
do{
Expand All @@ -48,9 +49,29 @@ PA_FORCE_INLINE void filter_per_pixel(
image = (const uint32_t*)((const char*)image + in_bytes_per_row);
out = (uint32_t*)((const char*)out + out_bytes_per_row);
}while (--height);
return;
}

}else{
size_t left = width % VECTOR_SIZE;
// Divisible by vector width. No need for peel iteration.
size_t left = width % VECTOR_SIZE;
if (left == 0){
do{
const uint32_t* in = image;
uint32_t* o0 = out;
size_t lc = width / VECTOR_SIZE;
do{
filter.process_full(o0, in);
in += VECTOR_SIZE;
o0 += VECTOR_SIZE;
}while (--lc);
image = (const uint32_t*)((const char*)image + in_bytes_per_row);
out = (uint32_t*)((const char*)out + out_bytes_per_row);
}while (--height);
return;
}

// Need both steady-state and peel iteration.
{
do{
const uint32_t* in = image;
uint32_t* o0 = out;
Expand All @@ -60,9 +81,7 @@ PA_FORCE_INLINE void filter_per_pixel(
in += VECTOR_SIZE;
o0 += VECTOR_SIZE;
}while (--lc);
if (left != 0){
filter.process_partial(o0, in, left);
}
filter.process_partial(o0, in, left);
image = (const uint32_t*)((const char*)image + in_bytes_per_row);
out = (uint32_t*)((const char*)out + out_bytes_per_row);
}while (--height);
Expand All @@ -88,47 +107,73 @@ PA_FORCE_INLINE void filter_per_pixel(
}

const size_t VECTOR_SIZE = Runner::VECTOR_SIZE;
do{
// Less than vector width. No need for steady-state loop.
if (width < VECTOR_SIZE){
typename Runner::Mask mask(width);
do{
const uint32_t* in = image;
for (size_t c = 0; c < filter_count; c++){
entries[c].process_partial(filters[c].data, in, mask);
}
image = (const uint32_t*)((const char*)image + bytes_per_row);
for (size_t c = 0; c < filter_count; c++){
FilterRgb32RangeFilter& filter = filters[c];
filter.data = (uint32_t*)((const char*)filter.data + filter.bytes_per_row);
}
}while (--height);
break;
}

if (width < VECTOR_SIZE){
typename Runner::Mask mask(width);
do{
const uint32_t* in = image;
for (size_t c = 0; c < filter_count; c++){
entries[c].process_partial(filters[c].data, in, mask);
}
image = (const uint32_t*)((const char*)image + bytes_per_row);
for (size_t c = 0; c < filter_count; c++){
FilterRgb32RangeFilter& filter = filters[c];
filter.data = (uint32_t*)((const char*)filter.data + filter.bytes_per_row);
}
}while (--height);

}else{
// Divisible by vector width. No need for peel loop.
size_t left = width % VECTOR_SIZE;
do{
const uint32_t* in = image;
size_t shift = 0;
size_t lc = width / VECTOR_SIZE;
if (left == 0){
do{
const uint32_t* in = image;
size_t shift = 0;
size_t lc = width / VECTOR_SIZE;
do{
for (size_t c = 0; c < filter_count; c++){
entries[c].process_full(filters[c].data + shift, in);
}
in += VECTOR_SIZE;
shift += VECTOR_SIZE;
}while (--lc);
image = (const uint32_t*)((const char*)image + bytes_per_row);
for (size_t c = 0; c < filter_count; c++){
entries[c].process_full(filters[c].data + shift, in);
FilterRgb32RangeFilter& filter = filters[c];
filter.data = (uint32_t*)((const char*)filter.data + filter.bytes_per_row);
}
in += VECTOR_SIZE;
shift += VECTOR_SIZE;
}while (--lc);
if (left != 0){
}while (--height);
break;
}

// Need both steady-state and peel loops.
{
typename Runner::Mask mask(left);
do{
const uint32_t* in = image;
size_t shift = 0;
size_t lc = width / VECTOR_SIZE;
do{
for (size_t c = 0; c < filter_count; c++){
entries[c].process_full(filters[c].data + shift, in);
}
in += VECTOR_SIZE;
shift += VECTOR_SIZE;
}while (--lc);
for (size_t c = 0; c < filter_count; c++){
entries[c].process_partial(filters[c].data + shift, in, left);
entries[c].process_partial(filters[c].data + shift, in, mask);
}
}
image = (const uint32_t*)((const char*)image + bytes_per_row);
for (size_t c = 0; c < filter_count; c++){
FilterRgb32RangeFilter& filter = filters[c];
filter.data = (uint32_t*)((const char*)filter.data + filter.bytes_per_row);
}
}while (--height);
}

image = (const uint32_t*)((const char*)image + bytes_per_row);
for (size_t c = 0; c < filter_count; c++){
FilterRgb32RangeFilter& filter = filters[c];
filter.data = (uint32_t*)((const char*)filter.data + filter.bytes_per_row);
}
}while (--height);
break;
}
}while (false);

for (size_t c = 0; c < filter_count; c++){
filters[c].pixels_in_range = entries[c].count();
Expand All @@ -154,46 +199,73 @@ PA_FORCE_INLINE void to_blackwhite_rbg32(
}

const size_t VECTOR_SIZE = Runner::VECTOR_SIZE;
do{
// Less than vector width. No need for steady-state loop.
if (width < VECTOR_SIZE){
typename Runner::Mask mask(width);
do{
const uint32_t* in = image;
for (size_t c = 0; c < filter_count; c++){
entries[c].process_partial(filters[c].data, in, mask);
}
image = (const uint32_t*)((const char*)image + bytes_per_row);
for (size_t c = 0; c < filter_count; c++){
ToBlackWhiteRgb32RangeFilter& filter = filters[c];
filter.data = (uint32_t*)((const char*)filter.data + filter.bytes_per_row);
}
}while (--height);
break;
}

if (width < VECTOR_SIZE){
typename Runner::Mask mask(width);
do{
const uint32_t* in = image;
for (size_t c = 0; c < filter_count; c++){
entries[c].process_partial(filters[c].data, in, mask);
}
image = (const uint32_t*)((const char*)image + bytes_per_row);
for (size_t c = 0; c < filter_count; c++){
ToBlackWhiteRgb32RangeFilter& filter = filters[c];
filter.data = (uint32_t*)((const char*)filter.data + filter.bytes_per_row);
}
}while (--height);

}else{
// Divisible by vector width. No need for peel loop.
size_t left = width % VECTOR_SIZE;
do{
const uint32_t* in = image;
size_t shift = 0;
size_t lc = width / VECTOR_SIZE;
if (left == 0){
do{
const uint32_t* in = image;
size_t shift = 0;
size_t lc = width / VECTOR_SIZE;
do{
for (size_t c = 0; c < filter_count; c++){
entries[c].process_full(filters[c].data + shift, in);
}
in += VECTOR_SIZE;
shift += VECTOR_SIZE;
}while (--lc);
image = (const uint32_t*)((const char*)image + bytes_per_row);
for (size_t c = 0; c < filter_count; c++){
entries[c].process_full(filters[c].data + shift, in);
ToBlackWhiteRgb32RangeFilter& filter = filters[c];
filter.data = (uint32_t*)((const char*)filter.data + filter.bytes_per_row);
}
in += VECTOR_SIZE;
shift += VECTOR_SIZE;
}while (--lc);
if (left != 0){
}while (--height);
break;
}

// Need both steady-state and peel loops.
{
typename Runner::Mask mask(left);
do{
const uint32_t* in = image;
size_t shift = 0;
size_t lc = width / VECTOR_SIZE;
do{
for (size_t c = 0; c < filter_count; c++){
entries[c].process_full(filters[c].data + shift, in);
}
in += VECTOR_SIZE;
shift += VECTOR_SIZE;
}while (--lc);
for (size_t c = 0; c < filter_count; c++){
entries[c].process_partial(filters[c].data + shift, in, left);
entries[c].process_partial(filters[c].data + shift, in, mask);
}
}
image = (const uint32_t*)((const char*)image + bytes_per_row);
for (size_t c = 0; c < filter_count; c++){
ToBlackWhiteRgb32RangeFilter& filter = filters[c];
filter.data = (uint32_t*)((const char*)filter.data + filter.bytes_per_row);
}
}while (--height);
}
image = (const uint32_t*)((const char*)image + bytes_per_row);
for (size_t c = 0; c < filter_count; c++){
ToBlackWhiteRgb32RangeFilter& filter = filters[c];
filter.data = (uint32_t*)((const char*)filter.data + filter.bytes_per_row);
}
}while (--height);
break;
}
}while (false);

for (size_t c = 0; c < filter_count; c++){
filters[c].pixels_in_range = entries[c].count();
Expand Down

0 comments on commit 763cb1c

Please sign in to comment.