From 7ab7ce7d556191d6434b87156bb9af4bf5c1434a Mon Sep 17 00:00:00 2001 From: Lu Fang Date: Fri, 3 Jan 2025 17:45:09 -0800 Subject: [PATCH] Replace c10::optional with std::optional Signed-off-by: Lu Fang --- csrc/attention/paged_attention_v1.cu | 4 +- csrc/attention/paged_attention_v2.cu | 4 +- csrc/cpu/attention.cpp | 8 ++-- csrc/cpu/quant.cpp | 10 ++-- csrc/cpu/torch_bindings.cpp | 6 +-- .../epilogue/scaled_mm_epilogues_c2x.hpp | 6 +-- .../epilogue/scaled_mm_epilogues_c3x.hpp | 6 +-- csrc/cutlass_extensions/torch_utils.hpp | 2 +- csrc/mamba/causal_conv1d/causal_conv1d.cu | 24 +++++----- csrc/mamba/mamba_ssm/selective_scan_fwd.cu | 22 ++++----- csrc/ops.h | 46 +++++++++---------- .../compressed_tensors/int8_quant_kernels.cu | 4 +- .../cutlass_w8a8/scaled_mm_c2x.cu | 18 ++++---- .../cutlass_w8a8/scaled_mm_c3x.cu | 6 +-- .../cutlass_w8a8/scaled_mm_entry.cu | 30 ++++++------ csrc/quantization/machete/generate.py | 2 +- .../machete/machete_mm_kernel.cuh | 10 ++-- .../machete/machete_mm_launcher.cuh | 24 +++++----- .../machete/machete_prepack_launcher.cuh | 2 +- csrc/quantization/machete/machete_pytorch.cu | 26 +++++------ csrc/rocm/attention.cu | 4 +- csrc/rocm/ops.h | 2 +- csrc/sparse/cutlass/sparse_scaled_mm_c3x.cu | 2 +- csrc/sparse/cutlass/sparse_scaled_mm_entry.cu | 4 +- 24 files changed, 136 insertions(+), 136 deletions(-) diff --git a/csrc/attention/paged_attention_v1.cu b/csrc/attention/paged_attention_v1.cu index cb1a069942069..27321148f6dda 100644 --- a/csrc/attention/paged_attention_v1.cu +++ b/csrc/attention/paged_attention_v1.cu @@ -53,7 +53,7 @@ void paged_attention_v1_launcher( torch::Tensor& out, torch::Tensor& query, torch::Tensor& key_cache, torch::Tensor& value_cache, int num_kv_heads, float scale, torch::Tensor& block_tables, torch::Tensor& seq_lens, int max_seq_len, - const c10::optional& alibi_slopes, float k_scale, + const std::optional& alibi_slopes, float k_scale, float v_scale, const int tp_rank, const int blocksparse_local_blocks, const int blocksparse_vert_stride, const int blocksparse_block_size, const int blocksparse_head_sliding_step) { @@ -176,7 +176,7 @@ void paged_attention_v1( torch::Tensor& block_tables, // [num_seqs, max_num_blocks_per_seq] torch::Tensor& seq_lens, // [num_seqs] int64_t block_size, int64_t max_seq_len, - const c10::optional& alibi_slopes, + const std::optional& alibi_slopes, const std::string& kv_cache_dtype, double k_scale, double v_scale, const int64_t tp_rank, const int64_t blocksparse_local_blocks, const int64_t blocksparse_vert_stride, const int64_t blocksparse_block_size, diff --git a/csrc/attention/paged_attention_v2.cu b/csrc/attention/paged_attention_v2.cu index c457bdb89008e..a453b2243e48c 100644 --- a/csrc/attention/paged_attention_v2.cu +++ b/csrc/attention/paged_attention_v2.cu @@ -54,7 +54,7 @@ void paged_attention_v2_launcher( torch::Tensor& tmp_out, torch::Tensor& query, torch::Tensor& key_cache, torch::Tensor& value_cache, int num_kv_heads, float scale, torch::Tensor& block_tables, torch::Tensor& seq_lens, int max_seq_len, - const c10::optional& alibi_slopes, float k_scale, + const std::optional& alibi_slopes, float k_scale, float v_scale, const int tp_rank, const int blocksparse_local_blocks, const int blocksparse_vert_stride, const int blocksparse_block_size, const int blocksparse_head_sliding_step) { @@ -187,7 +187,7 @@ void paged_attention_v2( torch::Tensor& block_tables, // [num_seqs, max_num_blocks_per_seq] torch::Tensor& seq_lens, // [num_seqs] int64_t block_size, int64_t max_seq_len, - const c10::optional& alibi_slopes, + const std::optional& alibi_slopes, const std::string& kv_cache_dtype, double k_scale, double v_scale, const int64_t tp_rank, const int64_t blocksparse_local_blocks, const int64_t blocksparse_vert_stride, const int64_t blocksparse_block_size, diff --git a/csrc/cpu/attention.cpp b/csrc/cpu/attention.cpp index e21832ba7582f..ef5b14088c63b 100644 --- a/csrc/cpu/attention.cpp +++ b/csrc/cpu/attention.cpp @@ -386,7 +386,7 @@ void paged_attention_v1_impl_launcher( torch::Tensor& out, torch::Tensor& query, torch::Tensor& key_cache, torch::Tensor& value_cache, int num_kv_heads, float scale, torch::Tensor& block_tables, torch::Tensor& seq_lens, int max_seq_len, - const c10::optional& alibi_slopes) { + const std::optional& alibi_slopes) { int num_seqs = query.size(0); int num_heads = query.size(1); int head_size = query.size(2); @@ -459,7 +459,7 @@ void paged_attention_v1( torch::Tensor& out, torch::Tensor& query, torch::Tensor& key_cache, torch::Tensor& value_cache, int64_t num_kv_heads, double scale, torch::Tensor& block_tables, torch::Tensor& seq_lens, int64_t block_size, - int64_t max_seq_len, const c10::optional& alibi_slopes, + int64_t max_seq_len, const std::optional& alibi_slopes, const std::string& kv_cache_dtype, double k_scale, double v_scale, const int64_t tp_rank, const int64_t blocksparse_local_blocks, const int64_t blocksparse_vert_stride, const int64_t blocksparse_block_size, @@ -702,7 +702,7 @@ void paged_attention_v2_impl_launcher( torch::Tensor& tmp_out, torch::Tensor& query, torch::Tensor& key_cache, torch::Tensor& value_cache, int num_kv_heads, float scale, torch::Tensor& block_tables, torch::Tensor& seq_lens, int block_size, - int max_seq_len, const c10::optional& alibi_slopes) { + int max_seq_len, const std::optional& alibi_slopes) { int num_seqs = query.size(0); int num_heads = query.size(1); int head_size = query.size(2); @@ -781,7 +781,7 @@ void paged_attention_v2( torch::Tensor& tmp_out, torch::Tensor& query, torch::Tensor& key_cache, torch::Tensor& value_cache, int64_t num_kv_heads, double scale, torch::Tensor& block_tables, torch::Tensor& seq_lens, int64_t block_size, - int64_t max_seq_len, const c10::optional& alibi_slopes, + int64_t max_seq_len, const std::optional& alibi_slopes, const std::string& kv_cache_dtype, double k_scale, double v_scale, const int64_t tp_rank, const int64_t blocksparse_local_blocks, const int64_t blocksparse_vert_stride, const int64_t blocksparse_block_size, diff --git a/csrc/cpu/quant.cpp b/csrc/cpu/quant.cpp index d9aed657a3113..33b1637832888 100644 --- a/csrc/cpu/quant.cpp +++ b/csrc/cpu/quant.cpp @@ -359,7 +359,7 @@ void int8_scaled_mm(torch::Tensor& c, // [M, OC], row-major const torch::Tensor& b, // [IC, OC], column-major const torch::Tensor& a_scales, // [1] or [M] const torch::Tensor& b_scales, // [1] or [OC] - const c10::optional& bias // [OC] + const std::optional& bias // [OC] ) { CPU_KERNEL_GUARD_IN(cutlass_scaled_mm) // Checks for conformality @@ -442,8 +442,8 @@ void int8_scaled_mm_azp(torch::Tensor& c, // [M, OC], row-major const torch::Tensor& a_scales, // [1] or [M] const torch::Tensor& b_scales, // [1] or [OC] const torch::Tensor& azp_adj, // [OC] - const c10::optional& azp, // [1] or [M] - const c10::optional& bias // [OC] + const std::optional& azp, // [1] or [M] + const std::optional& bias // [OC] ) { CPU_KERNEL_GUARD_IN(cutlass_scaled_mm_azp) // Checks for conformality @@ -561,7 +561,7 @@ void int8_scaled_mm_azp(torch::Tensor& c, // [M, OC], row-major void static_scaled_int8_quant(torch::Tensor& out, // [..., hidden_size] const torch::Tensor& input, // [..., hidden_size] const torch::Tensor& scale, - c10::optional const& azp) { + std::optional const& azp) { CPU_KERNEL_GUARD_IN(static_scaled_int8_quant) TORCH_CHECK(input.is_contiguous()); TORCH_CHECK(out.is_contiguous()); @@ -590,7 +590,7 @@ void dynamic_scaled_int8_quant( torch::Tensor& out, // [..., hidden_size] const torch::Tensor& input, // [..., hidden_size] torch::Tensor& scale, // [..., 1] - c10::optional const& azp) { + std::optional const& azp) { CPU_KERNEL_GUARD_IN(dynamic_scaled_int8_quant) TORCH_CHECK(input.is_contiguous()); TORCH_CHECK(out.is_contiguous()); diff --git a/csrc/cpu/torch_bindings.cpp b/csrc/cpu/torch_bindings.cpp index 03beefbc6de7d..74e4d8189d403 100644 --- a/csrc/cpu/torch_bindings.cpp +++ b/csrc/cpu/torch_bindings.cpp @@ -9,14 +9,14 @@ std::string init_cpu_threads_env(const std::string& cpu_ids); void int8_scaled_mm(torch::Tensor& c, const torch::Tensor& a, const torch::Tensor& b, const torch::Tensor& a_scales, const torch::Tensor& b_scales, - const c10::optional& bias); + const std::optional& bias); void int8_scaled_mm_azp(torch::Tensor& c, const torch::Tensor& a, const torch::Tensor& b, const torch::Tensor& a_scales, const torch::Tensor& b_scales, const torch::Tensor& azp_adj, - const c10::optional& azp, - const c10::optional& bias); + const std::optional& azp, + const std::optional& bias); TORCH_LIBRARY_EXPAND(TORCH_EXTENSION_NAME, ops) { // vLLM custom ops diff --git a/csrc/cutlass_extensions/epilogue/scaled_mm_epilogues_c2x.hpp b/csrc/cutlass_extensions/epilogue/scaled_mm_epilogues_c2x.hpp index 26f7423fd7455..ef413e6dd75c5 100644 --- a/csrc/cutlass_extensions/epilogue/scaled_mm_epilogues_c2x.hpp +++ b/csrc/cutlass_extensions/epilogue/scaled_mm_epilogues_c2x.hpp @@ -68,7 +68,7 @@ struct ScaledEpilogueBase { // This overload handles the case where there might not be a tensor, in which // case a nullptr is passed and a constant (0) is used. template - static auto args_from_tensor(c10::optional const& tensor) { + static auto args_from_tensor(std::optional const& tensor) { static_assert(std::is_same_v>); using Arguments = typename Descriptor::Arguments; auto* data_ptr = tensor ? static_cast(tensor->data_ptr()) : nullptr; @@ -223,7 +223,7 @@ struct ScaledEpilogueBiasAzp static ArgumentType prepare_args(torch::Tensor const& a_scales, torch::Tensor const& b_scales, torch::Tensor const& azp_adj, - c10::optional const& bias) { + std::optional const& bias) { auto a_args = SUPER::template args_from_tensor(a_scales); auto b_args = SUPER::template args_from_tensor(b_scales); auto bias_args = SUPER::template args_from_tensor(bias); @@ -301,7 +301,7 @@ struct ScaledEpilogueBiasAzpToken torch::Tensor const& b_scales, torch::Tensor const& azp_adj, torch::Tensor const& azp, - c10::optional const& bias) { + std::optional const& bias) { auto a_args = SUPER::template args_from_tensor(a_scales); auto b_args = SUPER::template args_from_tensor(b_scales); auto bias_args = SUPER::template args_from_tensor(bias); diff --git a/csrc/cutlass_extensions/epilogue/scaled_mm_epilogues_c3x.hpp b/csrc/cutlass_extensions/epilogue/scaled_mm_epilogues_c3x.hpp index c723adf126422..c590c66a66652 100644 --- a/csrc/cutlass_extensions/epilogue/scaled_mm_epilogues_c3x.hpp +++ b/csrc/cutlass_extensions/epilogue/scaled_mm_epilogues_c3x.hpp @@ -67,7 +67,7 @@ struct ScaledEpilogueBase { // This overload handles the case where there might not be a tensor, in which // case a nullptr is passed and a constant (0) is used. template - static auto args_from_tensor(c10::optional const& tensor) { + static auto args_from_tensor(std::optional const& tensor) { using Arguments = typename Descriptor::Arguments; auto* data_ptr = tensor ? static_cast(tensor->data_ptr()) : nullptr; static_assert(std::is_same_v> || @@ -223,7 +223,7 @@ struct ScaledEpilogueBiasAzp static ArgumentType prepare_args(torch::Tensor const& a_scales, torch::Tensor const& b_scales, torch::Tensor const& azp_adj, - c10::optional const& bias) { + std::optional const& bias) { auto a_args = SUPER::template args_from_tensor(a_scales); auto b_args = SUPER::template args_from_tensor(b_scales); auto bias_args = SUPER::template args_from_tensor(bias); @@ -299,7 +299,7 @@ struct ScaledEpilogueBiasAzpToken torch::Tensor const& b_scales, torch::Tensor const& azp_adj, torch::Tensor const& azp, - c10::optional const& bias) { + std::optional const& bias) { auto a_args = SUPER::template args_from_tensor(a_scales); auto b_args = SUPER::template args_from_tensor(b_scales); auto bias_args = SUPER::template args_from_tensor(bias); diff --git a/csrc/cutlass_extensions/torch_utils.hpp b/csrc/cutlass_extensions/torch_utils.hpp index 2c78572521eec..a1ff933cce63f 100644 --- a/csrc/cutlass_extensions/torch_utils.hpp +++ b/csrc/cutlass_extensions/torch_utils.hpp @@ -97,7 +97,7 @@ static inline auto make_cute_layout(torch::Tensor const& tensor, template static inline auto maybe_make_cute_layout( - c10::optional const& tensor, + std::optional const& tensor, std::string_view name = "tensor") { using Layout = decltype(make_cute_layout(*tensor)); diff --git a/csrc/mamba/causal_conv1d/causal_conv1d.cu b/csrc/mamba/causal_conv1d/causal_conv1d.cu index dd1e6de2e0180..f0e5533bcae60 100644 --- a/csrc/mamba/causal_conv1d/causal_conv1d.cu +++ b/csrc/mamba/causal_conv1d/causal_conv1d.cu @@ -53,12 +53,12 @@ void set_conv_params_fwd(ConvParamsBase ¶ms, const at::Tensor x, const at::Tensor weight, const at::Tensor out, - const c10::optional& bias, + const std::optional& bias, bool silu_activation, int64_t pad_slot_id, - const c10::optional& query_start_loc = std::nullopt, - const c10::optional& cache_indices = std::nullopt, - const c10::optional& has_initial_state = std::nullopt) { + const std::optional& query_start_loc = std::nullopt, + const std::optional& cache_indices = std::nullopt, + const std::optional& has_initial_state = std::nullopt) { // Reset the parameters memset(¶ms, 0, sizeof(params)); @@ -93,11 +93,11 @@ void set_conv_params_fwd(ConvParamsBase ¶ms, void causal_conv1d_fwd(const at::Tensor &x, const at::Tensor &weight, - const c10::optional &bias_, - const c10::optional &conv_states, - const c10::optional &query_start_loc, - const c10::optional &cache_indices, - const c10::optional &has_initial_state, + const std::optional &bias_, + const std::optional &conv_states, + const std::optional &query_start_loc, + const std::optional &cache_indices, + const std::optional &has_initial_state, bool silu_activation, // used to identify padding entries if cache_indices provided // in case of padding, the kernel will return early @@ -194,10 +194,10 @@ void causal_conv1d_fwd(const at::Tensor &x, const at::Tensor &weight, void causal_conv1d_update(const at::Tensor &x, const at::Tensor &conv_state, const at::Tensor &weight, - const c10::optional &bias_, + const std::optional &bias_, bool silu_activation, - const c10::optional &cache_seqlens_, - const c10::optional &conv_state_indices_, + const std::optional &cache_seqlens_, + const std::optional &conv_state_indices_, // used to identify padding entries if cache_indices provided // in case of padding, the kernel will return early int64_t pad_slot_id) { diff --git a/csrc/mamba/mamba_ssm/selective_scan_fwd.cu b/csrc/mamba/mamba_ssm/selective_scan_fwd.cu index 71624696338d0..bd0a34119c82b 100644 --- a/csrc/mamba/mamba_ssm/selective_scan_fwd.cu +++ b/csrc/mamba/mamba_ssm/selective_scan_fwd.cu @@ -402,14 +402,14 @@ void set_ssm_params_fwd(SSMParamsBase ¶ms, const torch::Tensor out, const torch::Tensor z, const torch::Tensor out_z, - const c10::optional& D, - const c10::optional& delta_bias, + const std::optional& D, + const std::optional& delta_bias, const torch::Tensor ssm_states, bool has_z, bool delta_softplus, - const c10::optional& query_start_loc, - const c10::optional& cache_indices, - const c10::optional& has_initial_state, + const std::optional& query_start_loc, + const std::optional& cache_indices, + const std::optional& has_initial_state, bool varlen, int64_t pad_slot_id) { @@ -504,13 +504,13 @@ void set_ssm_params_fwd(SSMParamsBase ¶ms, void selective_scan_fwd(const torch::Tensor &u, const torch::Tensor &delta, const torch::Tensor &A, const torch::Tensor &B, const torch::Tensor &C, - const c10::optional &D_, - const c10::optional &z_, - const c10::optional &delta_bias_, + const std::optional &D_, + const std::optional &z_, + const std::optional &delta_bias_, bool delta_softplus, - const c10::optional &query_start_loc, - const c10::optional &cache_indices, - const c10::optional &has_initial_state, + const std::optional &query_start_loc, + const std::optional &cache_indices, + const std::optional &has_initial_state, const torch::Tensor &ssm_states, // used to identify padding entries if cache_indices provided // in case of padding, the kernel will return early diff --git a/csrc/ops.h b/csrc/ops.h index 347c502845d8f..9efd9b0c24700 100644 --- a/csrc/ops.h +++ b/csrc/ops.h @@ -33,7 +33,7 @@ void paged_attention_v1( torch::Tensor& out, torch::Tensor& query, torch::Tensor& key_cache, torch::Tensor& value_cache, int64_t num_kv_heads, double scale, torch::Tensor& block_tables, torch::Tensor& seq_lens, int64_t block_size, - int64_t max_seq_len, const c10::optional& alibi_slopes, + int64_t max_seq_len, const std::optional& alibi_slopes, const std::string& kv_cache_dtype, double k_scale, double v_scale, const int64_t tp_rank, const int64_t blocksparse_local_blocks, const int64_t blocksparse_vert_stride, const int64_t blocksparse_block_size, @@ -44,7 +44,7 @@ void paged_attention_v2( torch::Tensor& tmp_out, torch::Tensor& query, torch::Tensor& key_cache, torch::Tensor& value_cache, int64_t num_kv_heads, double scale, torch::Tensor& block_tables, torch::Tensor& seq_lens, int64_t block_size, - int64_t max_seq_len, const c10::optional& alibi_slopes, + int64_t max_seq_len, const std::optional& alibi_slopes, const std::string& kv_cache_dtype, double k_scale, double v_scale, const int64_t tp_rank, const int64_t blocksparse_local_blocks, const int64_t blocksparse_vert_stride, const int64_t blocksparse_block_size, @@ -153,15 +153,15 @@ bool cutlass_scaled_mm_supports_fp8(int64_t cuda_device_capability); void cutlass_scaled_mm(torch::Tensor& out, torch::Tensor const& a, torch::Tensor const& b, torch::Tensor const& a_scales, torch::Tensor const& b_scales, - c10::optional const& bias); + std::optional const& bias); void cutlass_scaled_mm_azp(torch::Tensor& out, torch::Tensor const& a, torch::Tensor const& b, torch::Tensor const& a_scales, torch::Tensor const& b_scales, torch::Tensor const& azp_adj, - c10::optional const& azp, - c10::optional const& bias); + std::optional const& azp, + std::optional const& bias); bool cutlass_sparse_scaled_mm_supported(int64_t cuda_device_capability); @@ -169,7 +169,7 @@ void cutlass_scaled_sparse_mm(torch::Tensor& out, torch::Tensor const& a, torch::Tensor const& b, torch::Tensor const& e, torch::Tensor const& a_scales, torch::Tensor const& b_scales, - c10::optional const& bias); + std::optional const& bias); bool cutlass_sparse_compress_entry(torch::Tensor& a_compressed, torch::Tensor& e, torch::Tensor const& a); @@ -177,11 +177,11 @@ bool cutlass_sparse_compress_entry(torch::Tensor& a_compressed, void static_scaled_int8_quant(torch::Tensor& out, torch::Tensor const& input, torch::Tensor const& scale, - c10::optional const& azp); + std::optional const& azp); void dynamic_scaled_int8_quant(torch::Tensor& out, torch::Tensor const& input, torch::Tensor& scales, - c10::optional const& azp); + std::optional const& azp); torch::Tensor gptq_gemm(torch::Tensor a, torch::Tensor b_q_weight, torch::Tensor b_gptq_qzeros, @@ -198,34 +198,34 @@ void dynamic_scaled_fp8_quant(torch::Tensor& out, torch::Tensor const& input, void dynamic_per_token_scaled_fp8_quant( torch::Tensor& out, torch::Tensor const& input, torch::Tensor& scale, - c10::optional const& scale_ub); + std::optional const& scale_ub); void selective_scan_fwd(const torch::Tensor& u, const torch::Tensor& delta, const torch::Tensor& A, const torch::Tensor& B, const torch::Tensor& C, - const c10::optional& D_, - const c10::optional& z_, - const c10::optional& delta_bias_, + const std::optional& D_, + const std::optional& z_, + const std::optional& delta_bias_, bool delta_softplus, - const c10::optional& query_start_loc, - const c10::optional& cache_indices, - const c10::optional& has_initial_state, + const std::optional& query_start_loc, + const std::optional& cache_indices, + const std::optional& has_initial_state, const torch::Tensor& ssm_states, int64_t pad_slot_id); void causal_conv1d_update(const at::Tensor& x, const at::Tensor& conv_state, const at::Tensor& weight, - const c10::optional& bias_, + const std::optional& bias_, bool silu_activation, - const c10::optional& cache_seqlens_, - const c10::optional& conv_state_indices_, + const std::optional& cache_seqlens_, + const std::optional& conv_state_indices_, int64_t pad_slot_id); void causal_conv1d_fwd(const at::Tensor& x, const at::Tensor& weight, - const c10::optional& bias_, - const c10::optional& conv_states, - const c10::optional& query_start_loc, - const c10::optional& cache_indices, - const c10::optional& has_initial_state, + const std::optional& bias_, + const std::optional& conv_states, + const std::optional& query_start_loc, + const std::optional& cache_indices, + const std::optional& has_initial_state, bool silu_activation, int64_t pad_slot_id); #ifndef USE_ROCM diff --git a/csrc/quantization/compressed_tensors/int8_quant_kernels.cu b/csrc/quantization/compressed_tensors/int8_quant_kernels.cu index e9987535bd3ea..e79785827189d 100644 --- a/csrc/quantization/compressed_tensors/int8_quant_kernels.cu +++ b/csrc/quantization/compressed_tensors/int8_quant_kernels.cu @@ -226,7 +226,7 @@ __global__ void dynamic_scaled_int8_azp_quant_kernel( void static_scaled_int8_quant(torch::Tensor& out, // [..., hidden_size] torch::Tensor const& input, // [..., hidden_size] torch::Tensor const& scale, - c10::optional const& azp) { + std::optional const& azp) { TORCH_CHECK(input.is_contiguous()); TORCH_CHECK(out.is_contiguous()); TORCH_CHECK(scale.numel() == 1); @@ -257,7 +257,7 @@ void static_scaled_int8_quant(torch::Tensor& out, // [..., hidden_size] void dynamic_scaled_int8_quant( torch::Tensor& out, // [..., hidden_size] torch::Tensor const& input, // [..., hidden_size] - torch::Tensor& scales, c10::optional const& azp) { + torch::Tensor& scales, std::optional const& azp) { TORCH_CHECK(input.is_contiguous()); TORCH_CHECK(out.is_contiguous()); TORCH_CHECK(scales.is_contiguous()); diff --git a/csrc/quantization/cutlass_w8a8/scaled_mm_c2x.cu b/csrc/quantization/cutlass_w8a8/scaled_mm_c2x.cu index dbb72e8bbd3f5..865fef5aeea11 100644 --- a/csrc/quantization/cutlass_w8a8/scaled_mm_c2x.cu +++ b/csrc/quantization/cutlass_w8a8/scaled_mm_c2x.cu @@ -39,7 +39,7 @@ void cutlass_scaled_mm_sm75(torch::Tensor& out, torch::Tensor const& a, torch::Tensor const& b, torch::Tensor const& a_scales, torch::Tensor const& b_scales, - c10::optional const& bias) { + std::optional const& bias) { TORCH_CHECK(a_scales.dtype() == torch::kFloat32); TORCH_CHECK(b_scales.dtype() == torch::kFloat32); if (bias) { @@ -58,8 +58,8 @@ void cutlass_scaled_mm_azp_sm75(torch::Tensor& out, torch::Tensor const& a, torch::Tensor const& a_scales, torch::Tensor const& b_scales, torch::Tensor const& azp_adj, - c10::optional const& azp, - c10::optional const& bias) { + std::optional const& azp, + std::optional const& bias) { TORCH_CHECK(a_scales.dtype() == torch::kFloat32); TORCH_CHECK(b_scales.dtype() == torch::kFloat32); @@ -94,7 +94,7 @@ void cutlass_scaled_mm_sm80(torch::Tensor& out, torch::Tensor const& a, torch::Tensor const& b, torch::Tensor const& a_scales, torch::Tensor const& b_scales, - c10::optional const& bias) { + std::optional const& bias) { TORCH_CHECK(a_scales.dtype() == torch::kFloat32); TORCH_CHECK(b_scales.dtype() == torch::kFloat32); if (bias) { @@ -113,8 +113,8 @@ void cutlass_scaled_mm_azp_sm80(torch::Tensor& out, torch::Tensor const& a, torch::Tensor const& a_scales, torch::Tensor const& b_scales, torch::Tensor const& azp_adj, - c10::optional const& azp, - c10::optional const& bias) { + std::optional const& azp, + std::optional const& bias) { TORCH_CHECK(a_scales.dtype() == torch::kFloat32); TORCH_CHECK(b_scales.dtype() == torch::kFloat32); @@ -165,7 +165,7 @@ void cutlass_scaled_mm_sm89(torch::Tensor& out, torch::Tensor const& a, torch::Tensor const& b, torch::Tensor const& a_scales, torch::Tensor const& b_scales, - c10::optional const& bias) { + std::optional const& bias) { TORCH_CHECK(a_scales.dtype() == torch::kFloat32); TORCH_CHECK(b_scales.dtype() == torch::kFloat32); if (bias) { @@ -184,8 +184,8 @@ void cutlass_scaled_mm_azp_sm89(torch::Tensor& out, torch::Tensor const& a, torch::Tensor const& a_scales, torch::Tensor const& b_scales, torch::Tensor const& azp_adj, - c10::optional const& azp, - c10::optional const& bias) { + std::optional const& azp, + std::optional const& bias) { TORCH_CHECK(a_scales.dtype() == torch::kFloat32); TORCH_CHECK(b_scales.dtype() == torch::kFloat32); diff --git a/csrc/quantization/cutlass_w8a8/scaled_mm_c3x.cu b/csrc/quantization/cutlass_w8a8/scaled_mm_c3x.cu index 123f4359c0d1a..e18d7d79e5b77 100644 --- a/csrc/quantization/cutlass_w8a8/scaled_mm_c3x.cu +++ b/csrc/quantization/cutlass_w8a8/scaled_mm_c3x.cu @@ -51,7 +51,7 @@ void cutlass_scaled_mm_sm90(torch::Tensor& c, torch::Tensor const& a, torch::Tensor const& b, torch::Tensor const& a_scales, torch::Tensor const& b_scales, - c10::optional const& bias) { + std::optional const& bias) { TORCH_CHECK(a_scales.dtype() == torch::kFloat32); TORCH_CHECK(b_scales.dtype() == torch::kFloat32); if (bias) { @@ -70,8 +70,8 @@ void cutlass_scaled_mm_azp_sm90(torch::Tensor& out, torch::Tensor const& a, torch::Tensor const& a_scales, torch::Tensor const& b_scales, torch::Tensor const& azp_adj, - c10::optional const& azp, - c10::optional const& bias) { + std::optional const& azp, + std::optional const& bias) { TORCH_CHECK(a_scales.dtype() == torch::kFloat32); TORCH_CHECK(b_scales.dtype() == torch::kFloat32); diff --git a/csrc/quantization/cutlass_w8a8/scaled_mm_entry.cu b/csrc/quantization/cutlass_w8a8/scaled_mm_entry.cu index 4f7b6588ef3f7..3f2b52624f366 100644 --- a/csrc/quantization/cutlass_w8a8/scaled_mm_entry.cu +++ b/csrc/quantization/cutlass_w8a8/scaled_mm_entry.cu @@ -9,26 +9,26 @@ void cutlass_scaled_mm_sm75(torch::Tensor& c, torch::Tensor const& a, torch::Tensor const& b, torch::Tensor const& a_scales, torch::Tensor const& b_scales, - c10::optional const& bias); + std::optional const& bias); void cutlass_scaled_mm_sm80(torch::Tensor& c, torch::Tensor const& a, torch::Tensor const& b, torch::Tensor const& a_scales, torch::Tensor const& b_scales, - c10::optional const& bias); + std::optional const& bias); void cutlass_scaled_mm_sm89(torch::Tensor& c, torch::Tensor const& a, torch::Tensor const& b, torch::Tensor const& a_scales, torch::Tensor const& b_scales, - c10::optional const& bias); + std::optional const& bias); #if defined ENABLE_SCALED_MM_C3X && ENABLE_SCALED_MM_C3X void cutlass_scaled_mm_sm90(torch::Tensor& c, torch::Tensor const& a, torch::Tensor const& b, torch::Tensor const& a_scales, torch::Tensor const& b_scales, - c10::optional const& bias); + std::optional const& bias); #endif void cutlass_scaled_mm_azp_sm75(torch::Tensor& c, torch::Tensor const& a, @@ -36,24 +36,24 @@ void cutlass_scaled_mm_azp_sm75(torch::Tensor& c, torch::Tensor const& a, torch::Tensor const& a_scales, torch::Tensor const& b_scales, torch::Tensor const& azp_adj, - c10::optional const& azp, - c10::optional const& bias); + std::optional const& azp, + std::optional const& bias); void cutlass_scaled_mm_azp_sm80(torch::Tensor& c, torch::Tensor const& a, torch::Tensor const& b, torch::Tensor const& a_scales, torch::Tensor const& b_scales, torch::Tensor const& azp_adj, - c10::optional const& azp, - c10::optional const& bias); + std::optional const& azp, + std::optional const& bias); void cutlass_scaled_mm_azp_sm89(torch::Tensor& c, torch::Tensor const& a, torch::Tensor const& b, torch::Tensor const& a_scales, torch::Tensor const& b_scales, torch::Tensor const& azp_adj, - c10::optional const& azp, - c10::optional const& bias); + std::optional const& azp, + std::optional const& bias); #if defined CUDA_VERSION && CUDA_VERSION >= 12000 void cutlass_scaled_mm_azp_sm90(torch::Tensor& c, torch::Tensor const& a, @@ -61,8 +61,8 @@ void cutlass_scaled_mm_azp_sm90(torch::Tensor& c, torch::Tensor const& a, torch::Tensor const& a_scales, torch::Tensor const& b_scales, torch::Tensor const& azp_adj, - c10::optional const& azp, - c10::optional const& bias); + std::optional const& azp, + std::optional const& bias); #endif bool cutlass_scaled_mm_supports_fp8(int64_t cuda_device_capability) { @@ -84,7 +84,7 @@ bool cutlass_scaled_mm_supports_fp8(int64_t cuda_device_capability) { void cutlass_scaled_mm(torch::Tensor& c, torch::Tensor const& a, torch::Tensor const& b, torch::Tensor const& a_scales, torch::Tensor const& b_scales, - c10::optional const& bias) { + std::optional const& bias) { // Checks for conformality TORCH_CHECK(a.dim() == 2 && b.dim() == 2 && c.dim() == 2); TORCH_CHECK(c.size(0) == a.size(0) && a.size(1) == b.size(0) && @@ -148,8 +148,8 @@ void cutlass_scaled_mm_azp(torch::Tensor& c, torch::Tensor const& a, torch::Tensor const& a_scales, torch::Tensor const& b_scales, torch::Tensor const& azp_adj, - c10::optional const& azp, - c10::optional const& bias) { + std::optional const& azp, + std::optional const& bias) { // Checks for conformality TORCH_CHECK(a.dim() == 2 && b.dim() == 2 && c.dim() == 2); TORCH_CHECK(c.size(0) == a.size(0) && a.size(1) == b.size(0) && diff --git a/csrc/quantization/machete/generate.py b/csrc/quantization/machete/generate.py index 2df4d181902f8..a9b5ddf4cbdd2 100644 --- a/csrc/quantization/machete/generate.py +++ b/csrc/quantization/machete/generate.py @@ -63,7 +63,7 @@ static inline std::optional maybe_scalartype( - c10::optional const& t) { + std::optional const& t) { if (!t) { return std::nullopt; } else { diff --git a/csrc/quantization/machete/machete_mm_kernel.cuh b/csrc/quantization/machete/machete_mm_kernel.cuh index d4d19ae5deec7..e4af067915e0a 100644 --- a/csrc/quantization/machete/machete_mm_kernel.cuh +++ b/csrc/quantization/machete/machete_mm_kernel.cuh @@ -183,11 +183,11 @@ struct MacheteKernelTemplate { torch::Tensor const& A, // MxK matrix torch::Tensor const& B, // KxN prepacked matrix torch::Tensor& D, // MxN matrix - c10::optional const& maybe_g_scales, // scale_KxN matrix - c10::optional const& maybe_g_zeros, // scale_KxN matrix - c10::optional maybe_group_size, - c10::optional const& maybe_ch_scales, // len N vector - c10::optional const& maybe_tok_scales) // len M vector + std::optional const& maybe_g_scales, // scale_KxN matrix + std::optional const& maybe_g_zeros, // scale_KxN matrix + std::optional maybe_group_size, + std::optional const& maybe_ch_scales, // len N vector + std::optional const& maybe_tok_scales) // len M vector { static_assert(!with_group_zeropoints || with_group_scales); diff --git a/csrc/quantization/machete/machete_mm_launcher.cuh b/csrc/quantization/machete/machete_mm_launcher.cuh index 4b0da5b303e0c..cabe0af46f069 100644 --- a/csrc/quantization/machete/machete_mm_launcher.cuh +++ b/csrc/quantization/machete/machete_mm_launcher.cuh @@ -13,23 +13,23 @@ struct MMArgs { torch::Tensor const& A; torch::Tensor const& B; vllm::ScalarType const& b_type; - c10::optional const& maybe_out_type; - c10::optional const& maybe_group_scales; - c10::optional const& maybe_group_zeros; - c10::optional maybe_group_size; - c10::optional const& maybe_channel_scales; - c10::optional const& maybe_token_scales; - c10::optional maybe_schedule; + std::optional const& maybe_out_type; + std::optional const& maybe_group_scales; + std::optional const& maybe_group_zeros; + std::optional maybe_group_size; + std::optional const& maybe_channel_scales; + std::optional const& maybe_token_scales; + std::optional maybe_schedule; }; struct SupportedSchedulesArgs { at::ScalarType a_type; vllm::ScalarType b_type; - c10::optional maybe_group_scales_type; - c10::optional maybe_group_zeros_type; - c10::optional maybe_channel_scales_type; - c10::optional maybe_token_scales_type; - c10::optional maybe_out_type; + std::optional maybe_group_scales_type; + std::optional maybe_group_zeros_type; + std::optional maybe_channel_scales_type; + std::optional maybe_token_scales_type; + std::optional maybe_out_type; }; torch::Tensor mm_dispatch(MMArgs args); diff --git a/csrc/quantization/machete/machete_prepack_launcher.cuh b/csrc/quantization/machete/machete_prepack_launcher.cuh index 3486d28be2126..634b651a4d107 100644 --- a/csrc/quantization/machete/machete_prepack_launcher.cuh +++ b/csrc/quantization/machete/machete_prepack_launcher.cuh @@ -10,7 +10,7 @@ struct PrepackBArgs { torch::Tensor const& B; at::ScalarType a_type; vllm::ScalarType b_type; - c10::optional maybe_group_scales_type; + std::optional maybe_group_scales_type; }; template diff --git a/csrc/quantization/machete/machete_pytorch.cu b/csrc/quantization/machete/machete_pytorch.cu index da2c2fb0d3e77..05a51ee21ddb7 100644 --- a/csrc/quantization/machete/machete_pytorch.cu +++ b/csrc/quantization/machete/machete_pytorch.cu @@ -10,11 +10,11 @@ using namespace vllm; std::vector supported_schedules( at::ScalarType a_type, int64_t b_type_id, - c10::optional maybe_group_scales_type, - c10::optional maybe_group_zeros_type, - c10::optional maybe_channel_scales_type, - c10::optional maybe_token_scales_type, - c10::optional maybe_out_type) { + std::optional maybe_group_scales_type, + std::optional maybe_group_zeros_type, + std::optional maybe_channel_scales_type, + std::optional maybe_token_scales_type, + std::optional maybe_out_type) { ScalarType const b_type = ScalarType::from_id(b_type_id); return supported_schedules_dispatch({ .a_type = a_type, @@ -29,13 +29,13 @@ std::vector supported_schedules( torch::Tensor mm(torch::Tensor const& A, torch::Tensor const& B, int64_t b_type_id, - c10::optional const& maybe_out_type, - c10::optional const& maybe_group_scales, - c10::optional const& maybe_group_zeros, - c10::optional maybe_group_size, - c10::optional const& maybe_channel_scales, - c10::optional const& maybe_token_scales, - c10::optional maybe_schedule) { + std::optional const& maybe_out_type, + std::optional const& maybe_group_scales, + std::optional const& maybe_group_zeros, + std::optional maybe_group_size, + std::optional const& maybe_channel_scales, + std::optional const& maybe_token_scales, + std::optional maybe_schedule) { ScalarType const b_type = ScalarType::from_id(b_type_id); return mm_dispatch({.A = A, .B = B, @@ -51,7 +51,7 @@ torch::Tensor mm(torch::Tensor const& A, torch::Tensor const& B, torch::Tensor prepack_B( torch::Tensor const& B, at::ScalarType const& a_type, int64_t b_type_id, - c10::optional const& maybe_group_scales_type) { + std::optional const& maybe_group_scales_type) { ScalarType const b_type = ScalarType::from_id(b_type_id); return prepack_B_dispatch( {.B = B, diff --git a/csrc/rocm/attention.cu b/csrc/rocm/attention.cu index b48348a515c8d..0fec9624c457e 100644 --- a/csrc/rocm/attention.cu +++ b/csrc/rocm/attention.cu @@ -928,7 +928,7 @@ void paged_attention_custom_launcher( torch::Tensor& tmp_out, torch::Tensor& query, torch::Tensor& key_cache, torch::Tensor& value_cache, const int num_kv_heads, float scale, torch::Tensor& block_tables, torch::Tensor& context_lens, - int max_context_len, const c10::optional& alibi_slopes, + int max_context_len, const std::optional& alibi_slopes, float k_scale, float v_scale) { int num_seqs = query.size(0); int num_heads = query.size(1); @@ -1086,7 +1086,7 @@ void paged_attention( torch::Tensor& block_tables, // [num_seqs, max_num_blocks_per_seq] torch::Tensor& context_lens, // [num_seqs] int64_t block_size, int64_t max_context_len, - const c10::optional& alibi_slopes, + const std::optional& alibi_slopes, const std::string& kv_cache_dtype, double k_scale, double v_scale) { const int head_size = query.size(2); if (kv_cache_dtype == "auto") { diff --git a/csrc/rocm/ops.h b/csrc/rocm/ops.h index 9f085115a3956..34b2f9ce8a4c4 100644 --- a/csrc/rocm/ops.h +++ b/csrc/rocm/ops.h @@ -9,6 +9,6 @@ void paged_attention(torch::Tensor& out, torch::Tensor& exp_sums, double scale, torch::Tensor& block_tables, torch::Tensor& context_lens, int64_t block_size, int64_t max_context_len, - const c10::optional& alibi_slopes, + const std::optional& alibi_slopes, const std::string& kv_cache_dtype, double k_scale, double v_scale); diff --git a/csrc/sparse/cutlass/sparse_scaled_mm_c3x.cu b/csrc/sparse/cutlass/sparse_scaled_mm_c3x.cu index 6223dc8cca704..5a1879787c328 100644 --- a/csrc/sparse/cutlass/sparse_scaled_mm_c3x.cu +++ b/csrc/sparse/cutlass/sparse_scaled_mm_c3x.cu @@ -286,7 +286,7 @@ void cutlass_scaled_sparse_mm_sm90(torch::Tensor& out, torch::Tensor const& a, torch::Tensor const& bt_meta, torch::Tensor const& a_scales, torch::Tensor const& b_scales, - c10::optional const& bias) { + std::optional const& bias) { TORCH_CHECK(a_scales.dtype() == torch::kFloat32); TORCH_CHECK(b_scales.dtype() == torch::kFloat32); if (bias) { diff --git a/csrc/sparse/cutlass/sparse_scaled_mm_entry.cu b/csrc/sparse/cutlass/sparse_scaled_mm_entry.cu index d464b045b895f..371de0950bc99 100644 --- a/csrc/sparse/cutlass/sparse_scaled_mm_entry.cu +++ b/csrc/sparse/cutlass/sparse_scaled_mm_entry.cu @@ -22,7 +22,7 @@ void cutlass_scaled_sparse_mm_sm90(torch::Tensor& c, torch::Tensor const& a, torch::Tensor const& e, torch::Tensor const& a_scales, torch::Tensor const& b_scales, - c10::optional const& bias); + std::optional const& bias); #endif void cutlass_scaled_sparse_mm(torch::Tensor& c, torch::Tensor const& a, @@ -30,7 +30,7 @@ void cutlass_scaled_sparse_mm(torch::Tensor& c, torch::Tensor const& a, torch::Tensor const& bt_meta, torch::Tensor const& a_scales, torch::Tensor const& b_scales, - c10::optional const& bias) { + std::optional const& bias) { // Checks for conformality TORCH_CHECK(a.dim() == 2 && bt_nzs.dim() == 2 && c.dim() == 2); TORCH_CHECK(c.size(1) == bt_nzs.size(0) && bt_nzs.size(1) * 2 == a.size(1) &&