diff --git a/.github/workflows/breeze.yml b/.github/workflows/breeze.yml new file mode 100644 index 000000000000..0d778f6cac86 --- /dev/null +++ b/.github/workflows/breeze.yml @@ -0,0 +1,114 @@ +# Copyright (c) Facebook, Inc. and its affiliates. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +name: Breeze Linux Build + +on: + push: + branches: + - "main" + paths: + - "velox/experimental/breeze/**" + - "velox/external/perfetto/**" + - "CMake/**" + - "scripts/setup-ubuntu.sh" + - "scripts/setup-helper-functions.sh" + - ".github/workflows/breeze.yml" + + pull_request: + paths: + - "velox/experimental/breeze/**" + - "velox/external/perfetto/**" + - "CMake/**" + - "scripts/setup-ubuntu.sh" + - "scripts/setup-helper-functions.sh" + - ".github/workflows/breeze.yml" + +permissions: + contents: read + +concurrency: + group: ${{ github.workflow }}-${{ github.repository }}-${{ github.head_ref || github.sha }} + cancel-in-progress: true + +jobs: + ubuntu-debug: + runs-on: ubuntu-22.04 + # prevent errors when forks ff their main branch + if: ${{ github.repository == 'facebookincubator/velox' }} + name: "Ubuntu debug" + defaults: + run: + shell: bash + working-directory: velox + steps: + + - uses: actions/checkout@v4 + with: + path: velox + + - name: Install Dependencies + run: | + source scripts/setup-ubuntu.sh && install_apt_deps + + - name: Make Debug Build + env: + VELOX_DEPENDENCY_SOURCE: BUNDLED + # OpenMP build with asan+ubsan enabled + run: | + cmake -S velox/experimental/breeze -B _build-breeze/debug \ + -DCMAKE_BUILD_TYPE=Asan \ + -DCMAKE_CXX_FLAGS="-fsanitize=undefined" \ + -DBUILD_GENERATE_TEST_FIXTURES=OFF \ + -DBUILD_OPENMP=ON + cmake --build _build-breeze/debug -j 8 + + - name: Run Tests + run: | + cd _build-breeze/debug && ctest -j 8 --output-on-failure --no-tests=error + + ubuntu-gpu-relwithdebinfo: + runs-on: 4-core-ubuntu-gpu-t4 + # prevent errors when forks ff their main branch + if: ${{ github.repository == 'facebookincubator/velox' }} + name: "Ubuntu GPU debug" + env: + CUDA_VERSION: "12.2" + defaults: + run: + shell: bash + working-directory: velox + steps: + + - uses: actions/checkout@v4 + with: + path: velox + + - name: Install Dependencies + run: | + source scripts/setup-ubuntu.sh && install_apt_deps && install_cuda ${CUDA_VERSION} + sudo chmod 755 -R /usr/local/lib/python3.10/dist-packages + + - name: Make RelWithDebInfo Build + run: | + cmake -S velox/experimental/breeze -B _build-breeze/relwithdebinfo \ + -DCMAKE_BUILD_TYPE=RelWithDebInfo \ + -DBUILD_GENERATE_TEST_FIXTURES=OFF \ + -DBUILD_CUDA=ON \ + -DCMAKE_NVCC_FLAGS="-arch=native" + cmake --build _build-breeze/relwithdebinfo -j 8 + + - name: Run Tests + run: | + cd _build-breeze/relwithdebinfo && ctest -j 8 --output-on-failure --no-tests=error diff --git a/velox/experimental/breeze/CMakeLists.txt b/velox/experimental/breeze/CMakeLists.txt index b3beedf31a64..122f0c995b04 100644 --- a/velox/experimental/breeze/CMakeLists.txt +++ b/velox/experimental/breeze/CMakeLists.txt @@ -39,6 +39,7 @@ option(BUILD_OPENCL "Build OpenCL tests." OFF) option(BUILD_OPENMP "Build OpenMP tests." OFF) option(BUILD_METAL "Build Metal tests." OFF) option(BUILD_TRACING "Build tracing." ON) +option(BUILD_GENERATE_TEST_FIXTURES "Generate test fixtures at build time." ON) if(NOT DEFINED PERFTEST_EXT_TYPES) set(PERFTEST_EXT_TYPES diff --git a/velox/experimental/breeze/cmake/cuda.cmake b/velox/experimental/breeze/cmake/cuda.cmake index c4761b10dcd8..732a8a23aef2 100644 --- a/velox/experimental/breeze/cmake/cuda.cmake +++ b/velox/experimental/breeze/cmake/cuda.cmake @@ -96,7 +96,7 @@ function(breeze_add_cuda_test target source) OUTPUT ${target} COMMAND ${NVCC_EXECUTABLE} -o ${target} ${target}.o ${arg_LIBS} - $/libgtest.a + $/libgtest.a $/libtest_main.a $<$:$/libperfetto.a> ${ARCH_LINK_FLAGS} @@ -107,8 +107,6 @@ function(breeze_add_cuda_test target source) PROPERTY IMPORTED_LOCATION ${CMAKE_CURRENT_BINARY_DIR}/${target}) gtest_discover_tests(${target}_TESTS TEST_PREFIX cuda: DISCOVERY_MODE PRE_TEST) - install(PROGRAMS ${CMAKE_CURRENT_BINARY_DIR}/${target} - DESTINATION ${CMAKE_INSTALL_BINDIR}) if(DEFINED CUDA_EXPECTED_RESOURCE_USAGE_DIR) if(EXISTS "${CUDA_EXPECTED_RESOURCE_USAGE_DIR}/${target}-expected.txt") set(GET_RESOURCE_USAGE_CMDLINE diff --git a/velox/experimental/breeze/cmake/hip.cmake b/velox/experimental/breeze/cmake/hip.cmake index 01c468110efc..f1462577138c 100644 --- a/velox/experimental/breeze/cmake/hip.cmake +++ b/velox/experimental/breeze/cmake/hip.cmake @@ -39,7 +39,7 @@ function(breeze_add_hip_test target source) OUTPUT ${target} COMMAND ${HIPCC_EXECUTABLE} -o ${target} ${target}.o - $/libgtest.a + $/libgtest.a $/libtest_main.a $<$:$/libperfetto.a> ${ARCH_LINK_FLAGS} @@ -49,6 +49,4 @@ function(breeze_add_hip_test target source) set_property(TARGET ${target}_TESTS PROPERTY IMPORTED_LOCATION ${CMAKE_CURRENT_BINARY_DIR}/${target}) gtest_discover_tests(${target}_TESTS TEST_PREFIX hip: DISCOVERY_MODE PRE_TEST) - install(PROGRAMS ${CMAKE_CURRENT_BINARY_DIR}/${target} - DESTINATION ${CMAKE_INSTALL_BINDIR}) endfunction() diff --git a/velox/experimental/breeze/cmake/sycl.cmake b/velox/experimental/breeze/cmake/sycl.cmake index 345b473b30a5..51c26acfeeb2 100644 --- a/velox/experimental/breeze/cmake/sycl.cmake +++ b/velox/experimental/breeze/cmake/sycl.cmake @@ -40,7 +40,7 @@ function(breeze_add_sycl_test target source) OUTPUT ${target} COMMAND ${SYCLCC_EXECUTABLE} -o ${target} ${target}.o - $/libgtest.a + $/libgtest.a $/libtest_main.a ${CMAKE_THREAD_LIBS_INIT} ${ARCH_LINK_FLAGS} $<$:$/libperfetto.a> @@ -51,6 +51,4 @@ function(breeze_add_sycl_test target source) PROPERTY IMPORTED_LOCATION ${CMAKE_CURRENT_BINARY_DIR}/${target}) gtest_discover_tests(${target}_TESTS TEST_PREFIX sycl: DISCOVERY_MODE PRE_TEST) - install(PROGRAMS ${CMAKE_CURRENT_BINARY_DIR}/${target} - DESTINATION ${CMAKE_INSTALL_BINDIR}) endfunction() diff --git a/velox/experimental/breeze/test/CMakeLists.txt b/velox/experimental/breeze/test/CMakeLists.txt index 069f14e45bfd..0afdd04880c8 100644 --- a/velox/experimental/breeze/test/CMakeLists.txt +++ b/velox/experimental/breeze/test/CMakeLists.txt @@ -18,7 +18,7 @@ include_directories(${CMAKE_SOURCE_DIR}) -cxx_library(test_main "${cxx_strict}" test_main.cpp) +add_library(test_main test_main.cpp) target_link_libraries( test_main PUBLIC gtest $<$:perfetto>) @@ -160,101 +160,103 @@ function( endfunction() function(breeze_add_unittests dir type) - if(BUILD_OPENCL) - generate_kernel( - ${type}_opencl_kernels_src - ${dir} - ${type} - "opencl" - "generated/${dir}/kernels-opencl.h") - generate_test_fixture( - ${type}_opencl_test_fixture_src - ${dir} - ${type} - "opencl" - "generated/${dir}/${type}_test-opencl.h") - set(_opencl_kernels_src ${type}_opencl_kernels_src) - set(_opencl_test_fixture_src ${type}_opencl_test_fixture_src) - endif() - if(BUILD_METAL) - generate_kernel( - ${type}_metal_kernels_src - ${dir} - ${type} - "metal" - "generated/${dir}/kernels-metal.h") - generate_test_fixture( - ${type}_metal_test_fixture_src - ${dir} - ${type} - "metal" - "generated/${dir}/${type}_test-metal.h") - set(_metal_kernels_src ${type}_metal_kernels_src) - set(_metal_test_fixture_src ${type}_metal_test_fixture_src) - endif() - if(BUILD_CUDA) - generate_kernel( - ${type}_cuda_kernels_src - ${dir} - ${type} - "cuda" - "generated/${dir}/kernels.cuh") - generate_test_fixture( - ${type}_cuda_test_fixture_src - ${dir} - ${type} - "cuda" - "generated/${dir}/${type}_test-cuda.cuh") - set(_cuda_kernels_src ${type}_cuda_kernels_src) - set(_cuda_test_fixture_src ${type}_cuda_test_fixture_src) - endif() - if(BUILD_HIP) - generate_kernel( - ${type}_hip_kernels_src - ${dir} - ${type} - "hip" - "generated/${dir}/kernels-hip.hpp") - generate_test_fixture( - ${type}_hip_test_fixture_src - ${dir} - ${type} - "hip" - "generated/${dir}/${type}_test-hip.hpp") - set(_hip_kernels_src ${type}_hip_kernels_src) - set(_hip_test_fixture_src ${type}_hip_test_fixture_src) - endif() - if(BUILD_SYCL) - generate_kernel( - ${type}_sycl_kernels_src - ${dir} - ${type} - "sycl" - "generated/${dir}/kernels-sycl.hpp") - generate_test_fixture( - ${type}_sycl_test_fixture_src - ${dir} - ${type} - "sycl" - "generated/${dir}/${type}_test-sycl.hpp") - set(_sycl_kernels_src ${type}_sycl_kernels_src) - set(_sycl_test_fixture_src ${type}_sycl_test_fixture_src) - endif() - if(BUILD_OPENMP) - generate_kernel( - ${type}_openmp_kernels_src - ${dir} - ${type} - "openmp" - "generated/${dir}/kernels-openmp.h") - generate_test_fixture( - ${type}_openmp_test_fixture_src - ${dir} - ${type} - "openmp" - "generated/${dir}/${type}_test-openmp.h") - set(_openmp_kernels_src ${type}_openmp_kernels_src) - set(_openmp_test_fixture_src ${type}_openmp_test_fixture_src) + if(BUILD_GENERATE_TEST_FIXTURES) + if(BUILD_OPENCL) + generate_kernel( + ${type}_opencl_kernels_src + ${dir} + ${type} + "opencl" + "generated/${dir}/kernels-opencl.h") + generate_test_fixture( + ${type}_opencl_test_fixture_src + ${dir} + ${type} + "opencl" + "generated/${dir}/${type}_test-opencl.h") + set(_opencl_kernels_src ${type}_opencl_kernels_src) + set(_opencl_test_fixture_src ${type}_opencl_test_fixture_src) + endif() + if(BUILD_METAL) + generate_kernel( + ${type}_metal_kernels_src + ${dir} + ${type} + "metal" + "generated/${dir}/kernels-metal.h") + generate_test_fixture( + ${type}_metal_test_fixture_src + ${dir} + ${type} + "metal" + "generated/${dir}/${type}_test-metal.h") + set(_metal_kernels_src ${type}_metal_kernels_src) + set(_metal_test_fixture_src ${type}_metal_test_fixture_src) + endif() + if(BUILD_CUDA) + generate_kernel( + ${type}_cuda_kernels_src + ${dir} + ${type} + "cuda" + "generated/${dir}/kernels-cuda.cuh") + generate_test_fixture( + ${type}_cuda_test_fixture_src + ${dir} + ${type} + "cuda" + "generated/${dir}/${type}_test-cuda.cuh") + set(_cuda_kernels_src ${type}_cuda_kernels_src) + set(_cuda_test_fixture_src ${type}_cuda_test_fixture_src) + endif() + if(BUILD_HIP) + generate_kernel( + ${type}_hip_kernels_src + ${dir} + ${type} + "hip" + "generated/${dir}/kernels-hip.hpp") + generate_test_fixture( + ${type}_hip_test_fixture_src + ${dir} + ${type} + "hip" + "generated/${dir}/${type}_test-hip.hpp") + set(_hip_kernels_src ${type}_hip_kernels_src) + set(_hip_test_fixture_src ${type}_hip_test_fixture_src) + endif() + if(BUILD_SYCL) + generate_kernel( + ${type}_sycl_kernels_src + ${dir} + ${type} + "sycl" + "generated/${dir}/kernels-sycl.hpp") + generate_test_fixture( + ${type}_sycl_test_fixture_src + ${dir} + ${type} + "sycl" + "generated/${dir}/${type}_test-sycl.hpp") + set(_sycl_kernels_src ${type}_sycl_kernels_src) + set(_sycl_test_fixture_src ${type}_sycl_test_fixture_src) + endif() + if(BUILD_OPENMP) + generate_kernel( + ${type}_openmp_kernels_src + ${dir} + ${type} + "openmp" + "generated/${dir}/kernels-openmp.h") + generate_test_fixture( + ${type}_openmp_test_fixture_src + ${dir} + ${type} + "openmp" + "generated/${dir}/${type}_test-openmp.h") + set(_openmp_kernels_src ${type}_openmp_kernels_src) + set(_openmp_test_fixture_src ${type}_openmp_test_fixture_src) + endif() endif() breeze_add_tests( ${dir} diff --git a/velox/experimental/breeze/test/generate.sh b/velox/experimental/breeze/test/generate.sh index fc41e4d5edad..1d65793bd99b 100755 --- a/velox/experimental/breeze/test/generate.sh +++ b/velox/experimental/breeze/test/generate.sh @@ -30,3 +30,5 @@ function generate { generate openmp "algorithm" h generate openmp "function" h +generate cuda "algorithm" cuh +generate cuda "function" cuh diff --git a/velox/experimental/breeze/test/generated/algorithms/algorithm_test-cuda.cuh b/velox/experimental/breeze/test/generated/algorithms/algorithm_test-cuda.cuh new file mode 100644 index 000000000000..2f96445832d4 --- /dev/null +++ b/velox/experimental/breeze/test/generated/algorithms/algorithm_test-cuda.cuh @@ -0,0 +1,88 @@ +/* + * Copyright (c) Facebook, Inc. and its affiliates. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/* + * Copyright (c) 2024 by Rivos Inc. + * Licensed under the Apache License, Version 2.0, see LICENSE for details. + * SPDX-License-Identifier: Apache-2.0 + */ + +/* + * This file is auto-generated from test_fixture_generator.py + * DO NOT EDIT! + */ + +#include + +#include + +#include "breeze/platforms/cuda.cuh" +#include "test/generated/algorithms/kernels-cuda.cuh" +#include "test/platforms/cuda_test.cuh" + +template +class AlgorithmTest : public ::testing::Test { + protected: + template + void Reduce(const std::vector& in, U* out, int num_blocks) { + std::vector vec_out(1, *out); + CudaTestLaunch( + num_blocks, + &kernels::Reduce, in, + vec_out, in.size()); + *out = vec_out[0]; + } + + template + void Scan(const std::vector& in, std::vector& out, int* next_blocks_idx, + std::vector& blocks, int num_blocks) { + std::vector vec_next_blocks_idx(1, *next_blocks_idx); + CudaTestLaunch( + num_blocks, + &kernels::Scan, + in, out, vec_next_blocks_idx, blocks, in.size()); + *next_blocks_idx = vec_next_blocks_idx[0]; + } + + template + void RadixSortHistogram(const std::vector& in, std::vector& out, + int num_blocks) { + CudaTestLaunch( + num_blocks, + &kernels::RadixSortHistogram, + in, out, in.size()); + } + + template + void RadixSort(const std::vector& in, + const std::vector& in_offsets, int start_bit, + int num_pass_bits, std::vector& out, + std::vector& next_block_idx, + std::vector& blocks, int num_blocks) { + const std::vector vec_start_bit(1, start_bit); + const std::vector vec_num_pass_bits(1, num_pass_bits); + CudaTestLaunch( + num_blocks, + &kernels::RadixSort, in, + in_offsets, vec_start_bit, vec_num_pass_bits, out, next_block_idx, + blocks, in.size()); + } +}; diff --git a/velox/experimental/breeze/test/generated/algorithms/kernels-cuda.cuh b/velox/experimental/breeze/test/generated/algorithms/kernels-cuda.cuh new file mode 100644 index 000000000000..e3e864b90698 --- /dev/null +++ b/velox/experimental/breeze/test/generated/algorithms/kernels-cuda.cuh @@ -0,0 +1,120 @@ +/* + * Copyright (c) Facebook, Inc. and its affiliates. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/* + * Copyright (c) 2024 by Rivos Inc. + * Licensed under the Apache License, Version 2.0, see LICENSE for details. + * SPDX-License-Identifier: Apache-2.0 + */ + +/* + * This file is auto-generated from kernel_generator.py + * DO NOT EDIT! + */ + +#include "breeze/algorithms/reduce.h" +#include "breeze/algorithms/scan.h" +#include "breeze/algorithms/sort.h" +#include "breeze/platforms/cuda.cuh" +#include "breeze/platforms/platform.h" +#include "breeze/utils/types.h" + +namespace kernels { + +enum { WARP_THREADS = 32 }; +template +__global__ void Reduce(const T* in, U* out, int num_items) { + using PlatformT = CudaPlatform; + PlatformT p; + __shared__ + typename breeze::algorithms::DeviceReduce::Scratch scratch_; + auto scratch = + (typename breeze::algorithms::DeviceReduce::Scratch*)&scratch_; + + breeze::algorithms::DeviceReduce::template Reduce< + Op, ITEMS_PER_THREAD>( + p, breeze::utils::make_slice(in), + breeze::utils::make_slice(out), + breeze::utils::make_slice(scratch), num_items); +} + +template +__global__ void Scan(const T* in, U* out, int* next_block_idx, V* blocks, + int num_items) { + using PlatformT = CudaPlatform; + PlatformT p; + __shared__ typename breeze::algorithms::DeviceScan< + PlatformT, U, ITEMS_PER_THREAD, LOOKBACK_DISTANCE>::Scratch scratch_; + auto scratch = (typename breeze::algorithms::DeviceScan< + PlatformT, U, ITEMS_PER_THREAD, + LOOKBACK_DISTANCE>::Scratch*)&scratch_; + + breeze::algorithms::DeviceScan:: + template Scan( + p, breeze::utils::make_slice(in), + breeze::utils::make_slice(out), + breeze::utils::make_slice(next_block_idx), + breeze::utils::make_slice(blocks), + breeze::utils::make_slice(scratch), num_items); +} + +template +__global__ void RadixSortHistogram(const T* in, unsigned* out, int num_items) { + using PlatformT = CudaPlatform; + PlatformT p; + __shared__ typename breeze::algorithms::DeviceRadixSortHistogram< + RADIX_BITS, T>::Scratch scratch_; + auto scratch = (typename breeze::algorithms::DeviceRadixSortHistogram< + RADIX_BITS, T>::Scratch*)&scratch_; + + breeze::algorithms::DeviceRadixSortHistogram::template Build< + ITEMS_PER_THREAD, TILE_SIZE>( + p, breeze::utils::make_slice(in), + breeze::utils::make_slice(out), + breeze::utils::make_slice(scratch), num_items); +} + +template +__global__ void RadixSort(const T* in, const unsigned* in_offsets, + const int* start_bit, const int* num_pass_bits, + T* out, int* next_block_idx, unsigned* blocks, + int num_items) { + using PlatformT = CudaPlatform; + PlatformT p; + __shared__ typename breeze::algorithms::DeviceRadixSort< + PlatformT, ITEMS_PER_THREAD, RADIX_BITS, T>::Scratch scratch_; + auto scratch = + (typename breeze::algorithms::DeviceRadixSort< + PlatformT, ITEMS_PER_THREAD, RADIX_BITS, T>::Scratch*)&scratch_; + + breeze::algorithms::DeviceRadixSort:: + template Sort( + p, breeze::utils::make_slice(in), + breeze::utils::make_slice(in_offsets), + *start_bit, *num_pass_bits, + breeze::utils::make_slice(out), + breeze::utils::make_slice(next_block_idx), + breeze::utils::make_slice(blocks), + breeze::utils::make_slice(scratch), num_items); +} + +} // namespace kernels diff --git a/velox/experimental/breeze/test/generated/functions/function_test-cuda.cuh b/velox/experimental/breeze/test/generated/functions/function_test-cuda.cuh new file mode 100644 index 000000000000..ce451a90c8b7 --- /dev/null +++ b/velox/experimental/breeze/test/generated/functions/function_test-cuda.cuh @@ -0,0 +1,160 @@ +/* + * Copyright (c) Facebook, Inc. and its affiliates. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/* + * Copyright (c) 2024 by Rivos Inc. + * Licensed under the Apache License, Version 2.0, see LICENSE for details. + * SPDX-License-Identifier: Apache-2.0 + */ + +/* + * This file is auto-generated from test_fixture_generator.py + * DO NOT EDIT! + */ + +#include + +#include + +#include "breeze/platforms/cuda.cuh" +#include "test/generated/functions/kernels-cuda.cuh" +#include "test/platforms/cuda_test.cuh" + +template +class FunctionTest : public ::testing::Test { + protected: + template + void BlockLoad(const std::vector& in, std::vector& out) { + CudaTestLaunch( + /*num_blocks=*/1, + &kernels::BlockLoad, in, out, + in.size()); + } + + template + void BlockLoadIf(const std::vector& in, + const std::vector& selection_flags, + std::vector& out) { + CudaTestLaunch( + /*num_blocks=*/1, + &kernels::BlockLoadIf, in, + selection_flags, out, in.size()); + } + + template + void BlockLoadFrom(const std::vector& in, const std::vector& offsets, + std::vector& out) { + CudaTestLaunch( + /*num_blocks=*/1, + &kernels::BlockLoadFrom, in, + offsets, out, in.size()); + } + + template + void BlockStore(const std::vector& in, std::vector& out) { + CudaTestLaunch( + /*num_blocks=*/1, + &kernels::BlockStore, in, out, + out.size()); + } + + template + void BlockStoreIf(const std::vector& in, + const std::vector& selection_flags, + std::vector& out) { + CudaTestLaunch( + /*num_blocks=*/1, + &kernels::BlockStoreIf, in, + selection_flags, out, out.size()); + } + + template + void BlockStoreAt(const std::vector& in, const std::vector& offsets, + std::vector& out) { + CudaTestLaunch( + /*num_blocks=*/1, + &kernels::BlockStoreAt, in, offsets, + out, out.size()); + } + + template + void BlockStoreAtIf(const std::vector& in, const std::vector& offsets, + const std::vector& selection_flags, + std::vector& out) { + CudaTestLaunch( + /*num_blocks=*/1, + &kernels::BlockStoreAtIf, in, + offsets, selection_flags, out, out.size()); + } + + template + void BlockFill(T value, std::vector& out) { + const std::vector vec_value(1, value); + CudaTestLaunch( + /*num_blocks=*/1, + &kernels::BlockFill, vec_value, out, + out.size()); + } + + template + void BlockFillAtIf(T value, const std::vector& offsets, + const std::vector& selection_flags, + std::vector& out) { + const std::vector vec_value(1, value); + CudaTestLaunch( + /*num_blocks=*/1, + &kernels::BlockFillAtIf, vec_value, + offsets, selection_flags, out, out.size()); + } + + template + void BlockReduce(const std::vector& in, U* out) { + std::vector vec_out(1, *out); + CudaTestLaunch( + /*num_blocks=*/1, + &kernels::BlockReduce, + in, vec_out, in.size()); + *out = vec_out[0]; + } + + template + void BlockScan(const std::vector& in, std::vector& out) { + CudaTestLaunch( + /*num_blocks=*/1, + &kernels::BlockScan, in, + out, in.size()); + } + + template + void BlockRadixRank(const std::vector& in, std::vector& out) { + CudaTestLaunch( + /*num_blocks=*/1, + &kernels::BlockRadixRank, + in, out, in.size()); + } + + template + void BlockRadixSort(const std::vector& in, std::vector& out) { + CudaTestLaunch( + /*num_blocks=*/1, + &kernels::BlockRadixSort, + in, out, in.size()); + } +}; diff --git a/velox/experimental/breeze/test/generated/functions/kernels-cuda.cuh b/velox/experimental/breeze/test/generated/functions/kernels-cuda.cuh new file mode 100644 index 000000000000..3aa089397bbb --- /dev/null +++ b/velox/experimental/breeze/test/generated/functions/kernels-cuda.cuh @@ -0,0 +1,284 @@ +/* + * Copyright (c) Facebook, Inc. and its affiliates. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/* + * Copyright (c) 2024 by Rivos Inc. + * Licensed under the Apache License, Version 2.0, see LICENSE for details. + * SPDX-License-Identifier: Apache-2.0 + */ + +/* + * This file is auto-generated from kernel_generator.py + * DO NOT EDIT! + */ + +#include "breeze/functions/load.h" +#include "breeze/functions/reduce.h" +#include "breeze/functions/scan.h" +#include "breeze/functions/sort.h" +#include "breeze/functions/store.h" +#include "breeze/platforms/cuda.cuh" +#include "breeze/platforms/platform.h" + +namespace kernels { + +enum { WARP_THREADS = 32 }; +template +__global__ void BlockLoad(const T* in, T* out, int num_items) { + using PlatformT = CudaPlatform; + PlatformT p; + T items[ITEMS_PER_THREAD]; + breeze::functions::BlockLoad( + p, breeze::utils::make_slice(in), + breeze::utils::make_slice(items), num_items); + breeze::functions::BlockStore( + p, breeze::utils::make_slice(items), + breeze::utils::make_slice(out), num_items); +} + +template +__global__ void BlockLoadIf(const T* in, const int* in_selection_flags, T* out, + int num_items) { + using PlatformT = CudaPlatform; + PlatformT p; + int selection_flags[ITEMS_PER_THREAD]; + breeze::functions::BlockLoad( + p, breeze::utils::make_slice(in_selection_flags), + breeze::utils::make_slice(selection_flags), num_items); + T items[ITEMS_PER_THREAD]; + breeze::functions::BlockLoadIf( + p, breeze::utils::make_slice(in), + breeze::utils::make_slice(selection_flags), + breeze::utils::make_slice(items), num_items); + breeze::functions::BlockStoreIf( + p, breeze::utils::make_slice(items), + breeze::utils::make_slice(selection_flags), + breeze::utils::make_slice(out), num_items); +} + +template +__global__ void BlockLoadFrom(const T* in, const int* in_offsets, T* out, + int num_items) { + using PlatformT = CudaPlatform; + PlatformT p; + int offsets[ITEMS_PER_THREAD]; + breeze::functions::BlockLoad( + p, breeze::utils::make_slice(in_offsets), + breeze::utils::make_slice(offsets), num_items); + T items[ITEMS_PER_THREAD]; + breeze::functions::BlockLoadFrom( + p, breeze::utils::make_slice(in), + breeze::utils::make_slice(offsets), breeze::utils::make_slice(items), + num_items); + breeze::functions::BlockStore( + p, breeze::utils::make_slice(items), + breeze::utils::make_slice(out), num_items); +} + +template +__global__ void BlockStore(const T* in, T* out, int num_items) { + using PlatformT = CudaPlatform; + PlatformT p; + breeze::functions::BlockStore( + p, + breeze::utils::make_slice( + in), + breeze::utils::make_slice(out), num_items); +} + +template +__global__ void BlockStoreIf(const T* in, const int* selection_flags, T* out, + int num_items) { + using PlatformT = CudaPlatform; + PlatformT p; + breeze::functions::BlockStoreIf( + p, + breeze::utils::make_slice( + in), + breeze::utils::make_slice( + selection_flags), + breeze::utils::make_slice(out), num_items); +} + +template +__global__ void BlockStoreAt(const T* in, const int* offsets, T* out, + int num_items) { + using PlatformT = CudaPlatform; + PlatformT p; + breeze::functions::BlockStoreAt( + p, + breeze::utils::make_slice( + in), + breeze::utils::make_slice( + offsets), + breeze::utils::make_slice(out), num_items); +} + +template +__global__ void BlockStoreAtIf(const T* in, const int* offsets, + const int* selection_flags, T* out, + int num_items) { + using PlatformT = CudaPlatform; + PlatformT p; + breeze::functions::BlockStoreAtIf( + p, + breeze::utils::make_slice( + in), + breeze::utils::make_slice( + offsets), + breeze::utils::make_slice( + selection_flags), + breeze::utils::make_slice(out), num_items); +} + +template +__global__ void BlockFill(const T* value, T* out, int num_items) { + using PlatformT = CudaPlatform; + PlatformT p; + breeze::functions::BlockFill( + p, *value, breeze::utils::make_slice(out), + num_items); +} + +template +__global__ void BlockFillAtIf(const T* value, const int* offsets, + const int* selection_flags, T* out, + int num_items) { + using PlatformT = CudaPlatform; + PlatformT p; + breeze::functions::BlockFillAtIf( + p, *value, + breeze::utils::make_slice( + offsets), + breeze::utils::make_slice( + selection_flags), + breeze::utils::make_slice(out), num_items); +} + +template +__global__ void BlockReduce(const T* in, U* out, int num_items) { + using PlatformT = CudaPlatform; + PlatformT p; + __shared__ + typename breeze::functions::BlockReduce::Scratch scratch_; + auto scratch = + (typename breeze::functions::BlockReduce::Scratch*)&scratch_; + + T items[ITEMS_PER_THREAD]; + breeze::functions::BlockLoad( + p, breeze::utils::make_slice(in), + breeze::utils::make_slice(items), num_items); + U aggregate = breeze::functions::BlockReduce::template Reduce< + Op, ITEMS_PER_THREAD>( + p, breeze::utils::make_slice(items), + breeze::utils::make_slice(scratch), num_items); + p.syncthreads(); + if (p.thread_idx() == 0) { + *out = aggregate; + } +} + +template +__global__ void BlockScan(const T* in, U* out, int num_items) { + using PlatformT = CudaPlatform; + PlatformT p; + __shared__ + typename breeze::functions::BlockScan::Scratch scratch_; + auto scratch = (typename breeze::functions::BlockScan< + PlatformT, U, ITEMS_PER_THREAD>::Scratch*)&scratch_; + + T items[ITEMS_PER_THREAD]; + breeze::functions::BlockLoad( + p, breeze::utils::make_slice(in), + breeze::utils::make_slice(items), num_items); + U sums[ITEMS_PER_THREAD]; + breeze::functions::BlockScan::template Scan< + Op>(p, breeze::utils::make_slice(items), breeze::utils::make_slice(sums), + breeze::utils::make_slice(scratch), num_items); + breeze::functions::BlockStore( + p, breeze::utils::make_slice(sums), + breeze::utils::make_slice(out), num_items); +} + +template +__global__ void BlockRadixRank(const T* in, int* out, int num_items) { + using PlatformT = CudaPlatform; + PlatformT p; + __shared__ + typename breeze::functions::BlockRadixRank::Scratch scratch_; + auto scratch = (typename breeze::functions::BlockRadixRank< + PlatformT, ITEMS_PER_THREAD, RADIX_BITS>::Scratch*)&scratch_; + + T items[ITEMS_PER_THREAD]; + // initialize invalid items to max value + for (int i = 0; i < ITEMS_PER_THREAD; ++i) { + items[i] = static_cast((1 << RADIX_BITS) - 1); + } + breeze::functions::BlockLoad( + p, breeze::utils::make_slice(in), + breeze::utils::make_slice(items), + num_items); + int ranks[ITEMS_PER_THREAD]; + breeze::functions::BlockRadixRank:: + Rank(p, + breeze::utils::make_slice(items), + breeze::utils::make_slice(ranks), + breeze::utils::make_slice(scratch)); + breeze::functions::BlockStore( + p, + breeze::utils::make_slice(ranks), + breeze::utils::make_slice(out), num_items); +} + +template +__global__ void BlockRadixSort(const T* in, T* out, int num_items) { + using PlatformT = CudaPlatform; + PlatformT p; + __shared__ typename breeze::functions::BlockRadixSort< + PlatformT, ITEMS_PER_THREAD, RADIX_BITS, T>::Scratch scratch_; + auto scratch = + (typename breeze::functions::BlockRadixSort< + PlatformT, ITEMS_PER_THREAD, RADIX_BITS, T>::Scratch*)&scratch_; + + T items[ITEMS_PER_THREAD]; + breeze::functions::BlockLoad( + p, breeze::utils::make_slice(in), + breeze::utils::make_slice(items), + num_items); + breeze::functions:: + BlockRadixSort::Sort( + p, + breeze::utils::make_slice(items), + breeze::utils::make_slice(scratch), num_items); + breeze::functions::BlockStore( + p, + breeze::utils::make_slice(items), + breeze::utils::make_slice(out), num_items); +} + +} // namespace kernels diff --git a/velox/experimental/breeze/test/test_fixture_generator.py b/velox/experimental/breeze/test/test_fixture_generator.py index 36425fac93d9..cbb6d7d19e81 100755 --- a/velox/experimental/breeze/test/test_fixture_generator.py +++ b/velox/experimental/breeze/test/test_fixture_generator.py @@ -275,7 +275,7 @@ def __init__(self): def includes(self, fixture_name): test_type = fixture_name.replace("Test", "").lower() return f""" - #include "test/generated/{test_type}s/kernels.cuh" + #include "test/generated/{test_type}s/kernels-cuda.cuh" #include "breeze/platforms/cuda.cuh" #include "test/platforms/cuda_test.cuh" """