diff --git a/include/matx/generators/random.h b/include/matx/generators/random.h
index e97833c4..ce526b67 100644
--- a/include/matx/generators/random.h
+++ b/include/matx/generators/random.h
@@ -448,7 +448,7 @@ template <typename T, int RANK> class randomTensorView_t {
       }
       else if constexpr (is_single_thread_host_executor_v<Executor>) {
         if (!init_) {
-          curandStatus_t ret;
+          [[maybe_unused]] curandStatus_t ret;
           ret = curandCreateGeneratorHost(&gen_, CURAND_RNG_PSEUDO_MT19937);
           MATX_ASSERT_STR_EXP(ret, CURAND_STATUS_SUCCESS, matxCudaError,
                               "Failed to create random number generator");
diff --git a/include/matx/transforms/inverse.h b/include/matx/transforms/inverse.h
index 7eb34107..5532a7ab 100644
--- a/include/matx/transforms/inverse.h
+++ b/include/matx/transforms/inverse.h
@@ -97,11 +97,13 @@ class matxInversePlan_t {
    * Inverse of A (if it exists)
    *
    */
-  matxInversePlan_t(TensorTypeAInv &a_inv, const TensorTypeA &a)
+  matxInversePlan_t(TensorTypeAInv &a_inv, const TensorTypeA &a, cudaStream_t stream)
   {
     static_assert(RANK >= 2);
     MATX_NVTX_START("", matx::MATX_NVTX_LOG_INTERNAL)
+
+    stream_ = stream;

     // Ok to remove since we're just passing a list of RO pointers
     //using a_nc = typename std::remove_const(a);

@@ -123,26 +125,29 @@ class matxInversePlan_t {
     // here as our batch dims
     std::vector<T1 *> in_pointers;
     std::vector<T1 *> out_pointers;
+    make_tensor(tmp_a_, a.Shape(), MATX_ASYNC_DEVICE_MEMORY, stream);
+    (tmp_a_ = a).run(stream);
+
     if constexpr (RANK == 2) {
-      in_pointers.push_back(&a(0, 0));
+      in_pointers.push_back(&tmp_a_(0, 0));
       out_pointers.push_back(&a_inv(0, 0));
     }
     else {
       using shape_type = typename TensorTypeA::desc_type::shape_type;
       int batch_offset = 2;
       std::array<shape_type, TensorTypeA::Rank()> idx{0};
-      auto a_shape = a.Shape();
+      auto a_shape = tmp_a_.Shape();

       // Get total number of batches
       size_t total_iter = std::accumulate(a_shape.begin(), a_shape.begin() + TensorTypeA::Rank() - batch_offset, 1,
                                           std::multiplies<size_t>());

       for (size_t iter = 0; iter < total_iter; iter++) {
-        auto ip = std::apply([&a](auto... param) { return a.GetPointer(param...); }, idx);
+        auto ip = std::apply([&](auto... param) { return tmp_a_.GetPointer(param...); }, idx);
         auto op = std::apply([&a_inv](auto... param) { return a_inv.GetPointer(param...); }, idx);
         in_pointers.push_back(ip);
         out_pointers.push_back(op);

         // Update all but the last 2 indices
-        UpdateIndices(a, idx, batch_offset);
+        UpdateIndices(tmp_a_, idx, batch_offset);
       }
     }
@@ -307,6 +312,8 @@ class matxInversePlan_t {
   int *d_info;
   T1 **d_A_array;
   T1 **d_A_inv_array;
+  cudaStream_t stream_;
+  matx::tensor_t<T1, RANK> tmp_a_;
 };

 /**
@@ -367,7 +374,7 @@ void inv_impl(TensorTypeAInv &a_inv, const TensorTypeA &a,
   // Get cache or new inverse plan if it doesn't exist
   auto ret = detail::inv_cache.Lookup(params);
   if (ret == std::nullopt) {
-    auto tmp = new detail::matxInversePlan_t{a_inv, a};
+    auto tmp = new detail::matxInversePlan_t{a_inv, a, stream};
     detail::inv_cache.Insert(params, static_cast<void *>(tmp));
     tmp->Exec(stream);
   }
diff --git a/include/matx/transforms/resample_poly.h b/include/matx/transforms/resample_poly.h
index 38fa477e..50517395 100644
--- a/include/matx/transforms/resample_poly.h
+++ b/include/matx/transforms/resample_poly.h
@@ -126,7 +126,7 @@ inline void resample_poly_impl(OutType &out, const InType &in, const FilterType
   }

   const index_t up_size = in.Size(RANK-1) * up;
-  const index_t outlen = up_size / down + ((up_size % down) ? 1 : 0);
+  [[maybe_unused]] const index_t outlen = up_size / down + ((up_size % down) ? 1 : 0);

   MATX_ASSERT_STR(out.Size(RANK-1) == outlen, matxInvalidDim,
                   "resample_poly: output size mismatch");