diff --git a/include/hc.hpp b/include/hc.hpp
index 5817321b902..309670c0f73 100644
--- a/include/hc.hpp
+++ b/include/hc.hpp
@@ -2817,7 +2817,10 @@ extern "C" inline __attribute((always_inline)) std::uint64_t __cycle_u64() __HC_
  *
  * @return The result will be in the range 0 to WAVESIZE - 1.
  */
-extern "C" unsigned int __activelaneid_u32() __HC__;
+extern "C" unsigned int __ockl_activelane_u32(void);
+extern "C" inline unsigned int __activelaneid_u32() __HC__ {
+  return __ockl_activelane_u32();
+}
 
 /**
  * Return a bit mask shows which active work-items in the
diff --git a/tests/Unit/AMDGPU/activelaneid.cpp b/tests/Unit/AMDGPU/activelaneid.cpp
new file mode 100644
index 00000000000..bc7eaf689fe
--- /dev/null
+++ b/tests/Unit/AMDGPU/activelaneid.cpp
@@ -0,0 +1,111 @@
+
+// RUN: %hc %s -o %t.out && %t.out
+
+#include <hc.hpp>
+
+#include <algorithm>
+#include <iostream>
+#include <random>
+#include <vector>
+
+#define WAVEFRONT_SIZE (64) // as of now, all HSA agents have wavefront size of 64
+
+#define GRID_SIZE (WAVEFRONT_SIZE * WAVEFRONT_SIZE)
+
+// value stored by work-items that do not call the builtin; outside 0..WAVEFRONT_SIZE-1
+#define INACTIVE_MARKER (99)
+
+#define TEST_DEBUG (0)
+
+// A test case to verify builtin function
+// - __activelaneid_u32
+
+// test __activelaneid_u32
+//
+// Launches GRID_SIZE work-items over a host table of WAVEFRONT_SIZE rows of
+// WAVEFRONT_SIZE flags. A work-item whose flag is 1 stores
+// __activelaneid_u32(); every other work-item stores INACTIVE_MARKER. The host
+// then checks that the flagged entries of each row read 0, 1, 2, ... in index
+// order. Returns true only if every entry matches.
+// NOTE(review): this presumes each row is executed by a single wavefront --
+// confirm for the target agent.
+bool test() {
+  using namespace hc;
+  bool ret = true;
+
+  // initialize test data
+  // test is a table of size WAVEFRONT_SIZE * WAVEFRONT_SIZE
+  // NOTE(review): unsigned int is used for the tables to match the return type
+  // of __activelaneid_u32 -- confirm against the intended element type.
+  std::vector<unsigned int> test(GRID_SIZE);
+
+  std::random_device rd;
+  std::mt19937 gen(rd());
+
+  // for each block of WAVEFRONT_SIZE, we randomly set 1s inside the block
+  // the number of 1s in the block equals to the index of the block
+  // (the 1st block of WAVEFRONT_SIZE has 0 1s, the 2nd block of WAVEFRONT_SIZE has 1 1, and so on)
+  for (int i = 0; i < WAVEFRONT_SIZE; ++i) {
+    auto row = test.begin() + i * WAVEFRONT_SIZE;
+    std::fill_n(row, i, 1u); // the rest of the row keeps its initial 0
+
+    // std::shuffle replaces the chained XOR swap (a ^= b ^= a ^= b), whose
+    // repeated writes to one element are unsequenced before C++17.
+    std::shuffle(row, row + WAVEFRONT_SIZE, gen);
+  }
+
+#if TEST_DEBUG
+  for (int i = 0; i < WAVEFRONT_SIZE; ++i) {
+    for (int j = 0; j < WAVEFRONT_SIZE; ++j) {
+      std::cout << test[i * WAVEFRONT_SIZE + j] << " ";
+    }
+    std::cout << "\n";
+  }
+#endif
+
+  array<unsigned int, 1> test_GPU(GRID_SIZE);
+  copy(test.begin(), test_GPU);
+
+  array<unsigned int, 1> output_GPU(GRID_SIZE);
+  extent<1> ex(GRID_SIZE);
+  parallel_for_each(ex, [&](index<1>& idx) [[hc]] {
+    if (test_GPU[idx] == 1)
+      output_GPU(idx) = __activelaneid_u32();
+    else
+      output_GPU(idx) = INACTIVE_MARKER;
+  }).wait();
+
+  // verify result
+  std::vector<unsigned int> output = output_GPU;
+  for (int i = 0; i < WAVEFRONT_SIZE; ++i) {
+    unsigned int activeLaneID = 0;
+    for (int j = 0; j < WAVEFRONT_SIZE; ++j) {
+      const int pos = i * WAVEFRONT_SIZE + j;
+      const unsigned int expected = (test[pos] == 1) ? activeLaneID++ : INACTIVE_MARKER;
+      const bool ok = (output[pos] == expected);
+      ret &= ok;
+
+#if TEST_DEBUG
+      // report the mismatch without resetting ret, so a debug build still fails
+      if (!ok) {
+        std::cout << "FAILED: laneid " << activeLaneID << " ";
+      }
+      std::cout << "i: " << i << " j: " << j << " input = " << test[pos] << " j = " << j << " and output = " << output[pos] << "\n";
+#endif
+    }
+#if TEST_DEBUG
+    std::cout << "\n";
+#endif
+  }
+
+  return ret;
+}
+
+int main() {
+  bool ret = true;
+
+  ret &= test();
+
+  // exit status 0 on success, 1 on failure
+  return ret ? 0 : 1;
+}