Skip to content

Commit

Permalink
Implement NEON filter RGB Euclidean
Browse files Browse the repository at this point in the history
  • Loading branch information
Gin committed Nov 23, 2023
1 parent f7c0107 commit 24c67d4
Show file tree
Hide file tree
Showing 3 changed files with 79 additions and 20 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -68,6 +68,7 @@ ImageRGB32 filter_rgb32_euclidean(
// `expected_color` with `replacement_color`.
// If `replace_color_within_range` is false, replace the color outside of the distance with the color `replacement_color`.
// Returns the # of pixels inside the distance.
// Note: the alpha channels of `image` and `expected_color` are ignored during computation.
ImageRGB32 filter_rgb32_euclidean(
size_t& pixels_in_range,
const ImageViewRGB32& image,
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -13,6 +13,7 @@
#include "Kernels_ImageFilter_Basic_Routines.h"

#include <iostream>
#include <iomanip>

namespace PokemonAutomation{
namespace Kernels{
Expand Down Expand Up @@ -66,8 +67,8 @@ class ImageFilter_RgbRange_arm64_NEON{
uint8x16_t cmp1 = vcgtq_u8(in_u8, m_maxs_u8);
// cmp: if mins > pixel or pixel > maxs per color channel
uint8x16_t cmp_u8 = vorrq_u8(cmp0, cmp1);
// cmp_32x4: if each pixel is within the range
// If a pixel is within [mins, maxs], its uint32_t in `cmp_32x4` is all 1 bits, otherwise, all 0 bits
// cmp_u32: if each pixel is within the range
// If a pixel is within [mins, maxs], its uint32_t in `cmp_u32` is all 1 bits, otherwise, all 0 bits
uint32x4_t cmp_u32 = vceqq_u32(vreinterpretq_u32_u8(cmp_u8), m_zeros_u8);
// Increase count for each pixel in range. Each uint32 lane is counted separately.
// We achieve +=1 by subtracting 0xFFFFFFFF
Expand Down Expand Up @@ -128,22 +129,70 @@ class ImageFilter_RgbEuclidean_arm64_NEON{
static const size_t VECTOR_SIZE = 4;

public:
ImageFilter_RgbEuclidean_arm64_NEON(uint32_t expected, double max_euclidean_distance, uint32_t replacement_color, bool replace_color_within_range)
ImageFilter_RgbEuclidean_arm64_NEON(uint32_t expected_color, double max_euclidean_distance,
uint32_t replacement_color, bool replace_color_within_range)
: m_expected_color_g_s16(vreinterpretq_s16_u32(vdupq_n_u32((expected_color >> 8) & 0x000000ff)))
, m_expected_color_rb_s16(vreinterpretq_s16_u32(vdupq_n_u32(expected_color & 0x00ff00ff)))
, m_distance_squared_u32(vdupq_n_u32((uint32_t)(max_euclidean_distance * max_euclidean_distance)))
, m_replacement_color_u32(vdupq_n_u32(replacement_color))
, m_replace_color_within_range(replace_color_within_range)
, m_count_u32(vdupq_n_u32(0))
{}

// Returns the total number of in-range pixels counted so far, i.e. the sum of the four
// per-lane counters held in `m_count_u32`.
PA_FORCE_INLINE size_t count() const{
    // Pairwise widening add: lanes (0,1) and (2,3) are summed into two 64-bit lanes,
    // so the horizontal sum cannot wrap at 32 bits.
    const uint64x2_t sum_u64 = vpaddlq_u32(m_count_u32);
    return vgetq_lane_u64(sum_u64, 0) + vgetq_lane_u64(sum_u64, 1);
}

// Filters one full vector of 4 pixels: loads 4 uint32_t pixels from `in`, computes each pixel's
// squared RGB distance to the expected color, adds the number of in-range pixels to the running
// per-lane count, and stores 4 result pixels to `out`.
// The top byte of every pixel (0xff000000) is masked out and never contributes to the distance.
PA_FORCE_INLINE void process_full(uint32_t* out, const uint32_t* in){
    const uint32x4_t in_u32 = vld1q_u32(in);
    // Isolate the green channel (bits 8-15 of every pixel).
    const uint32x4_t in_g_u32 = vandq_u32(in_u32, vdupq_n_u32(0x0000ff00));
    // Move green to the low byte of its 16-bit lane; the other 16-bit lane of each pixel stays 0.
    const uint16x8_t in_g_u16 = vshrq_n_u16(vreinterpretq_u16_u32(in_g_u32), 8);
    // Red and blue channels: each one occupies the low byte of its own 16-bit lane.
    const uint16x8_t in_rb_u16 = vandq_u16(vreinterpretq_u16_u32(in_u32), vdupq_n_u16(0x00ff));

    // Signed per-channel difference from the expected color.
    const int16x8_t diff_g_s16 = vsubq_s16(vreinterpretq_s16_u16(in_g_u16), m_expected_color_g_s16);
    const int16x8_t diff_rb_s16 = vsubq_s16(vreinterpretq_s16_u16(in_rb_u16), m_expected_color_rb_s16);

    // Square every difference. Channels are 8-bit, so |difference| <= 255 and the square
    // (<= 65025) fits in an unsigned 16-bit lane.
    const uint16x8_t g2_u16 = vreinterpretq_u16_s16(vmulq_s16(diff_g_s16, diff_g_s16));
    const uint16x8_t r2b2_u16 = vreinterpretq_u16_s16(vmulq_s16(diff_rb_s16, diff_rb_s16));
    // Pairwise widening add turns each (g^2, 0) pair into one 32-bit g^2.
    // vpaddlq_u16 yields uint32x4_t, which is also the accumulator type vpadalq_u16 expects.
    const uint32x4_t g2_u32 = vpaddlq_u16(g2_u16);
    // Pairwise add-and-accumulate: g^2 + (r^2 + b^2) per pixel.
    const uint32x4_t sum_sqr_u32 = vpadalq_u16(g2_u32, r2b2_u16);

    // in_range_u32: all 1 bits in a pixel's 32-bit lane if it is within the distance, otherwise all 0 bits.
    const uint32x4_t in_range_u32 = vcleq_u32(sum_sqr_u32, m_distance_squared_u32);
    // Count in-range pixels, each 32-bit lane separately.
    // Subtracting 0xFFFFFFFF is the same as adding 1.
    m_count_u32 = vsubq_u32(m_count_u32, in_range_u32);

    // vbslq_u32(a, b, c): for 1 bits in a, choose b; for 0 bits in a, choose c.
    const uint32x4_t out_u32 = m_replace_color_within_range
        ? vbslq_u32(in_range_u32, m_replacement_color_u32, in_u32)
        : vbslq_u32(in_range_u32, in_u32, m_replacement_color_u32);
    vst1q_u32(out, out_u32);
}
// Filters the last `left` pixels of a row when fewer than a full vector remain.
// The pixels are copied into a 4-pixel scratch buffer, run through process_full(), and only
// the first `left` results are copied to `out`, so nothing is read or written past the row.
// NOTE(review): assumes left <= 4 (the scratch buffers hold 4 pixels) - confirm against the caller.
PA_FORCE_INLINE void process_partial(uint32_t* out, const uint32_t* in, size_t left){
    // Zero-fill so the padding lanes never hold indeterminate values.
    uint32_t buffer_in[4] = {0, 0, 0, 0};
    uint32_t buffer_out[4];
    memcpy(buffer_in, in, sizeof(uint32_t) * left);

    const uint32x4_t count_before = m_count_u32;
    process_full(buffer_out, buffer_in);

    // process_full() counts all 4 lanes, including the padding lanes that are not real pixels.
    // Keep only the increments that belong to the first `left` lanes.
    uint32_t increments[4];
    vst1q_u32(increments, vsubq_u32(m_count_u32, count_before));
    for (size_t lane = left; lane < 4; lane++){
        increments[lane] = 0;
    }
    m_count_u32 = vaddq_u32(count_before, vld1q_u32(increments));

    memcpy(out, buffer_out, sizeof(uint32_t) * left);
}

private:
// Placeholder: the argument is not used and the result is always 0.
PA_FORCE_INLINE int process_word(int pixel){
    static_cast<void>(pixel);
    const int result = 0;
    return result;
}
int8x16_t m_expected_color_g_s16;
int8x16_t m_expected_color_rb_s16;
uint32x4_t m_distance_squared_u32;
uint32x4_t m_replacement_color_u32;
bool m_replace_color_within_range;
uint32x4_t m_count_u32;

private:

Expand All @@ -154,11 +203,11 @@ size_t filter_rgb32_euclidean_arm64_NEON(
uint32_t expected, double max_euclidean_distance,
uint32_t replacement_color, bool replace_color_within_range
){
return filter_rgb32_euclidean_Default(in, in_bytes_per_row, width, height,
out, out_bytes_per_row, expected, max_euclidean_distance, replacement_color, replace_color_within_range);
// ImageFilter_RgbEuclidean_arm64_NEON filter(expected, max_euclidean_distance, replacement_color, replace_color_within_range);
// filter_per_pixel(in, in_bytes_per_row, width, height, filter, out, out_bytes_per_row);
// return filter.count();
// return filter_rgb32_euclidean_Default(in, in_bytes_per_row, width, height,
// out, out_bytes_per_row, expected, max_euclidean_distance, replacement_color, replace_color_within_range);
ImageFilter_RgbEuclidean_arm64_NEON filter(expected, max_euclidean_distance, replacement_color, replace_color_within_range);
filter_per_pixel(in, in_bytes_per_row, width, height, filter, out, out_bytes_per_row);
return filter.count();
}


Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -103,7 +103,7 @@ class ImageFilter_RgbEuclidean_x64_SSE42{
ImageFilter_RgbEuclidean_x64_SSE42(uint32_t expected, double max_euclidean_distance, uint32_t replacement, bool invert)
: m_replacement(_mm_set1_epi32(replacement))
, m_invert(invert ? _mm_set1_epi32(-1) : _mm_setzero_si128())
, m_expected_ag(_mm_set1_epi32((expected >> 8) & 0x000000ff))
, m_expected_g(_mm_set1_epi32((expected >> 8) & 0x000000ff))
, m_expected_rb(_mm_set1_epi32(expected & 0x00ff00ff))
, m_distance_squared(_mm_set1_epi32((uint32_t)(max_euclidean_distance * max_euclidean_distance)))
, m_count(_mm_setzero_si128())
Expand Down Expand Up @@ -131,17 +131,26 @@ class ImageFilter_RgbEuclidean_x64_SSE42{

private:
PA_FORCE_INLINE __m128i process_word(__m128i pixel){
__m128i ag = _mm_and_si128(_mm_srli_epi16(pixel, 8), _mm_set1_epi32(0x000000ff));
// _mm_srli_epi16: shifts each 16-bit integer in `pixel` right by 8 bits, shifting in zeros.
// g: green channels of each pixel, but shifted right by 8 bits
__m128i g = _mm_and_si128(_mm_srli_epi16(pixel, 8), _mm_set1_epi32(0x000000ff));
// rb: the red and blue channels of each pixel
__m128i rb = _mm_and_si128(pixel, _mm_set1_epi32(0x00ff00ff));

ag = _mm_sub_epi16(ag, m_expected_ag);
// g: the difference between input pixel channels and the expected values
g = _mm_sub_epi16(g, m_expected_g);
rb = _mm_sub_epi16(rb, m_expected_rb);

__m128i g = _mm_mullo_epi16(ag, ag);
// compute square operation:
// now each 16-bit region is a squared channel difference
// here we assume alpha channel from input image is the same as the expected,
// so the alpha channel difference is always 0, therefore:
// g: each 32-bit integer contains the green channel squared difference
__m128i g = _mm_mullo_epi16(g, g);
rb = _mm_mullo_epi16(rb, rb);
// r: each 32-bit integer contains the red channel squared difference
__m128i r = _mm_srli_epi32(rb, 16);
// b: each 32-bit integer contains the blue channel squared difference
__m128i b = _mm_and_si128(rb, _mm_set1_epi32(0x0000ffff));

// compute r^2 + g^2 + b^2
__m128i sum_sqr = _mm_add_epi32(r, g);
sum_sqr = _mm_add_epi32(sum_sqr, b);

Expand All @@ -155,7 +164,7 @@ class ImageFilter_RgbEuclidean_x64_SSE42{
private:
const __m128i m_replacement;
const __m128i m_invert;
const __m128i m_expected_ag;
const __m128i m_expected_g;
const __m128i m_expected_rb;
const __m128i m_distance_squared;
__m128i m_count;
Expand Down

0 comments on commit 24c67d4

Please sign in to comment.