Implement NEON filter RGB Euclidean

PokemonAutomation · Nov 23, 2023 · 24c67d4 · 24c67d4
1 parent f7c0107
commit 24c67d4
Show file tree

Hide file tree

Showing 3 changed files with 79 additions and 20 deletions.
diff --git a/SerialPrograms/Source/CommonFramework/ImageTools/ImageFilter.h b/SerialPrograms/Source/CommonFramework/ImageTools/ImageFilter.h
@@ -68,6 +68,7 @@ ImageRGB32 filter_rgb32_euclidean(
 //    `expected_color` with `replacement_color`.
 //  If `replace_color_within_range` is false, replace the color outside of the distance with the color `replacement_color`.
 //  Returns the # of pixels inside the distance.
+//  Note: the alpha channel of `image` and `expected_color` are ignored during computation.
 ImageRGB32 filter_rgb32_euclidean(
     size_t& pixels_in_range,
     const ImageViewRGB32& image,

diff --git a/SerialPrograms/Source/Kernels/ImageFilters/Kernels_ImageFilter_Basic_arm64_NEON.cpp b/SerialPrograms/Source/Kernels/ImageFilters/Kernels_ImageFilter_Basic_arm64_NEON.cpp
@@ -13,6 +13,7 @@
 #include "Kernels_ImageFilter_Basic_Routines.h"
 
 #include <iostream>
+#include <iomanip>
 
 namespace PokemonAutomation{
 namespace Kernels{
@@ -66,8 +67,8 @@ class ImageFilter_RgbRange_arm64_NEON{
         uint8x16_t cmp1 = vcgtq_u8(in_u8, m_maxs_u8);
         // cmp: if mins > pixel or pixel > maxs per color channel
         uint8x16_t cmp_u8 = vorrq_u8(cmp0, cmp1);
-        // cmp_32x4: if each pixel is within the range
-        // If a pixel is within [mins, maxs], its uint32_t in `cmp_32x4` is all 1 bits, otherwise, all 0 bits
+        // cmp_u32: if each pixel is within the range
+        // If a pixel is within [mins, maxs], its uint32_t in `cmp_u32` is all 1 bits, otherwise, all 0 bits
         uint32x4_t cmp_u32 = vceqq_u32(vreinterpretq_u32_u8(cmp_u8), m_zeros_u8);
         // Increase count for each pixel in range. Each uint32 lane is counted separately.
         // We achieve +=1 by substracting 0xFFFFFFFF
@@ -128,22 +129,70 @@ class ImageFilter_RgbEuclidean_arm64_NEON{
     static const size_t VECTOR_SIZE = 4;
 
 public:
-    ImageFilter_RgbEuclidean_arm64_NEON(uint32_t expected, double max_euclidean_distance, uint32_t replacement_color, bool replace_color_within_range)
+    ImageFilter_RgbEuclidean_arm64_NEON(uint32_t expected_color, double max_euclidean_distance,
+        uint32_t replacement_color, bool replace_color_within_range)
+        : m_expected_color_g_s16(vreinterpretq_s16_u32(vdupq_n_u32((expected_color >> 8) & 0x000000ff)))
+        , m_expected_color_rb_s16(vreinterpretq_s16_u32(vdupq_n_u32(expected_color & 0x00ff00ff)))
+        , m_distance_squared_u32(vdupq_n_u32((uint32_t)(max_euclidean_distance * max_euclidean_distance)))
+        , m_replacement_color_u32(vdupq_n_u32(replacement_color))
+        , m_replace_color_within_range(replace_color_within_range)
+        , m_count_u32(vdupq_n_u32(0))
     {}
 
     PA_FORCE_INLINE size_t count() const{
-        return 0;
+        uint64x2_t sum_u64 = vpaddlq_u32(m_count_u32);
+        return sum_u64[0] + sum_u64[1];
     }
 
     PA_FORCE_INLINE void process_full(uint32_t* out, const uint32_t* in){
+        uint32x4_t in_u32 = vld1q_u32(in);
+        // Get green channel
+        uint32x4_t in_g_u32 = vandq_u32(in_u32, vdupq_n_u32(0x0000ff00));
+        // Move green channel to the lower end of the 16-bit regions
+        uint16x8_t in_g_u16 = vshrq_n_u16(vreinterpretq_u16_u32(in_g_u32), 8);
+        // in_rb_u16 contains the red and blue channels. Each channel occupies a 16-bit region
+        uint16x8_t in_rb_u16 = vandq_u16(vreinterpretq_u16_u32(in_u32), vdupq_n_u16(0x00ff));
+        // subtract the expected values
+        int16x8_t in_g_s16 = vsubq_s16(vreinterpretq_s16_u16(in_g_u16), m_expected_color_g_s16);
+        int16x8_t in_rb_s16 = vsubq_s16(vreinterpretq_s16_u16(in_rb_u16), m_expected_color_rb_s16);
+
+        // Square operation
+        uint16x8_t in_g2_u16 = vreinterpretq_u16_s16(vmulq_s16(in_g_s16, in_g_s16));
+        uint16x8_t in_r2b2_u16 = vreinterpretq_u16_s16(vmulq_s16(in_rb_s16, in_rb_s16));
+        // Use pairwise addition operator vpaddlq_u16 to convert each g2 into 32-bit.
+        int32x4_t in_g2_u32 = vpaddlq_u16(in_g2_u16);
+        // Use pairwise addition and accumulate to add r2, g2, and b2 together
+        uint32x4_t sum_sqr_u32 = vpadalq_u16(in_g2_u32, in_r2b2_u16);
+
+        // cmp_u32: if each pixel is within the range, its uint32_t in `cmp_u32` is all 1 bits, otherwise, all 0 bits
+        uint32x4_t cmp_u32 = vcleq_u32(sum_sqr_u32, m_distance_squared_u32);
+        // Increase count for each pixel in range. Each uint32 lane is counted separately.
+        // We achieve +=1 by substracting 0xFFFFFFFF
+        m_count_u32 = vsubq_u32(m_count_u32, cmp_u32);
+        // select replacement color or in_u8 based on cmp_u32:
+        uint32x4_t out_u8;
+        if (m_replace_color_within_range){
+            // vbslq_u32(a, b, c) for 1 bits in a, choose b; for 0 bits in a, choose c
+            out_u8 = vbslq_u32(cmp_u32, m_replacement_color_u32, in_u32);
+        } else{
+            out_u8 = vbslq_u32(cmp_u32, in_u32, m_replacement_color_u32);
+        }
+        vst1q_u32(out, out_u8);
     }
     PA_FORCE_INLINE void process_partial(uint32_t* out, const uint32_t* in, size_t left){
+        uint32_t buffer_in[4], buffer_out[4];
+        memcpy(buffer_in, in, sizeof(uint32_t) * left);
+        process_full(buffer_out, buffer_in);
+        memcpy(out, buffer_out, sizeof(uint32_t) * left);
     }
 
 private:
-    PA_FORCE_INLINE int process_word(int pixel){
-        return 0;
-    }
+    int8x16_t m_expected_color_g_s16;
+    int8x16_t m_expected_color_rb_s16;
+    uint32x4_t m_distance_squared_u32;
+    uint32x4_t m_replacement_color_u32;
+    bool m_replace_color_within_range;
+    uint32x4_t m_count_u32;
 
 private:
 
@@ -154,11 +203,11 @@ size_t filter_rgb32_euclidean_arm64_NEON(
     uint32_t expected, double max_euclidean_distance,
     uint32_t replacement_color, bool replace_color_within_range
 ){
-    return filter_rgb32_euclidean_Default(in, in_bytes_per_row, width, height,
-        out, out_bytes_per_row, expected, max_euclidean_distance, replacement_color, replace_color_within_range);
-    // ImageFilter_RgbEuclidean_arm64_NEON filter(expected, max_euclidean_distance, replacement_color, replace_color_within_range);
-    // filter_per_pixel(in, in_bytes_per_row, width, height, filter, out, out_bytes_per_row);
-    // return filter.count();
+    // return filter_rgb32_euclidean_Default(in, in_bytes_per_row, width, height,
+    //     out, out_bytes_per_row, expected, max_euclidean_distance, replacement_color, replace_color_within_range);
+    ImageFilter_RgbEuclidean_arm64_NEON filter(expected, max_euclidean_distance, replacement_color, replace_color_within_range);
+    filter_per_pixel(in, in_bytes_per_row, width, height, filter, out, out_bytes_per_row);
+    return filter.count();
 }
 
 

diff --git a/SerialPrograms/Source/Kernels/ImageFilters/Kernels_ImageFilter_Basic_x64_SSE42.cpp b/SerialPrograms/Source/Kernels/ImageFilters/Kernels_ImageFilter_Basic_x64_SSE42.cpp
@@ -103,7 +103,7 @@ class ImageFilter_RgbEuclidean_x64_SSE42{
     ImageFilter_RgbEuclidean_x64_SSE42(uint32_t expected, double max_euclidean_distance, uint32_t replacement, bool invert)
         : m_replacement(_mm_set1_epi32(replacement))
         , m_invert(invert ? _mm_set1_epi32(-1) : _mm_setzero_si128())
-        , m_expected_ag(_mm_set1_epi32((expected >> 8) & 0x000000ff))
+        , m_expected_g(_mm_set1_epi32((expected >> 8) & 0x000000ff))
         , m_expected_rb(_mm_set1_epi32(expected & 0x00ff00ff))
         , m_distance_squared(_mm_set1_epi32((uint32_t)(max_euclidean_distance * max_euclidean_distance)))
         , m_count(_mm_setzero_si128())
@@ -131,17 +131,26 @@ class ImageFilter_RgbEuclidean_x64_SSE42{
 
 private:
     PA_FORCE_INLINE __m128i process_word(__m128i pixel){
-        __m128i ag = _mm_and_si128(_mm_srli_epi16(pixel, 8), _mm_set1_epi32(0x000000ff));
+        // _mm_srli_epi16: Shift 16-bit integers in pixels right by 8 while shifting in zeros, 
+        // g: green channels of each pixel, but shifted right by 8 bits
+        __m128i g = _mm_and_si128(_mm_srli_epi16(pixel, 8), _mm_set1_epi32(0x000000ff));
+        // rb: the red and blue channels of each pixel
         __m128i rb = _mm_and_si128(pixel, _mm_set1_epi32(0x00ff00ff));
-
-        ag = _mm_sub_epi16(ag, m_expected_ag);
+        // g: the difference between input pixel channels and the expected values
+        g = _mm_sub_epi16(g, m_expected_g);
         rb = _mm_sub_epi16(rb, m_expected_rb);
-
-        __m128i g = _mm_mullo_epi16(ag, ag);
+        // compute square operation: 
+        // now each 16-bit region is a squared channel difference
+        // here we assume alpha channel from input image is the same as the expected,
+        // so the alpha channel difference is always 0, therefore:
+        // g: each 32-bit integer contains the green channel squared difference
+        __m128i g = _mm_mullo_epi16(g, g);
         rb = _mm_mullo_epi16(rb, rb);
+        // r: each 32-bit integer contains the red channel squared difference
         __m128i r = _mm_srli_epi32(rb, 16);
+        // b: each 32-bit integer contains the blue channel squared difference
         __m128i b = _mm_and_si128(rb, _mm_set1_epi32(0x0000ffff));
-
+        // compute r^2 + g^2 + b^2
         __m128i sum_sqr = _mm_add_epi32(r, g);
         sum_sqr = _mm_add_epi32(sum_sqr, b);
 
@@ -155,7 +164,7 @@ class ImageFilter_RgbEuclidean_x64_SSE42{
 private:
     const __m128i m_replacement;
     const __m128i m_invert;
-    const __m128i m_expected_ag;
+    const __m128i m_expected_g;
     const __m128i m_expected_rb;
     const __m128i m_distance_squared;
     __m128i m_count;