added cublas for contract
MozammilQ committed Nov 28, 2024
1 parent a4dbd12 commit e1e80d1
Showing 5 changed files with 202 additions and 19 deletions.
@@ -363,7 +363,7 @@ void State::set_config(const Config &config) {

// Set device for SVD
#ifdef AER_THRUST_CUDA
- MPS::set_mps_svd_device(config.device);
+ MPS::set_mps_device(config.device);
#endif // AER_THRUST_CUDA
}

@@ -47,7 +47,7 @@ std::stringstream MPS::logging_str_;
bool MPS::mps_log_data_ = 0;
bool MPS::mps_lapack_ = false;
#ifdef AER_THRUST_CUDA
- std::string MPS::mps_svd_device_;
+ std::string MPS::mps_device_;
#endif // AER_THRUST_CUDA

//------------------------------------------------------------------------
@@ -631,8 +631,14 @@ void MPS::common_apply_2_qubit_gate(
if (A + 1 != num_qubits_ - 1)
q_reg_[A + 1].mul_Gamma_by_right_Lambda(lambda_reg_[A + 1]);

#ifdef AER_THRUST_CUDA
MPS_Tensor temp =
MPS_Tensor::contract(q_reg_[A], lambda_reg_[A], q_reg_[A + 1],
MPS::mps_device_, num_qubits_, cublas_handle);
#else
MPS_Tensor temp =
MPS_Tensor::contract(q_reg_[A], lambda_reg_[A], q_reg_[A + 1]);
#endif // AER_THRUST_CUDA

switch (gate_type) {
case cx:
@@ -669,8 +675,8 @@ void MPS::common_apply_2_qubit_gate(
rvector_t lambda;
#ifdef AER_THRUST_CUDA
double discarded_value = MPS_Tensor::Decompose(
- temp, left_gamma, lambda, right_gamma, MPS::mps_lapack_,
- MPS::mps_svd_device_, cuda_stream, cutensor_handle);
+ temp, left_gamma, lambda, right_gamma, MPS::mps_lapack_, MPS::mps_device_,
+ num_qubits_, cuda_stream, cutensor_handle);
#else
double discarded_value = MPS_Tensor::Decompose(temp, left_gamma, lambda,
right_gamma, MPS::mps_lapack_);
@@ -1301,6 +1307,7 @@ MPS_Tensor MPS::state_vec_as_MPS(const reg_t &qubits) {
}

MPS_Tensor MPS::state_vec_as_MPS(uint_t first_index, uint_t last_index) const {

MPS_Tensor temp = q_reg_[first_index];

if (first_index != 0)
@@ -1313,8 +1320,14 @@ MPS_Tensor MPS::state_vec_as_MPS(uint_t first_index, uint_t last_index) const {
}

for (uint_t i = first_index + 1; i < last_index + 1; i++) {
#ifdef AER_THRUST_CUDA
temp = MPS_Tensor::contract(temp, lambda_reg_[i - 1], q_reg_[i],
MPS::mps_device_, num_qubits_, cublas_handle);
#else
temp = MPS_Tensor::contract(temp, lambda_reg_[i - 1], q_reg_[i]);
#endif // AER_THRUST_CUDA
}

// now temp is a tensor of 2^n matrices
if (last_index != num_qubits_ - 1)
temp.mul_Gamma_by_right_Lambda(lambda_reg_[last_index]);
@@ -1815,7 +1828,23 @@ void MPS::initialize_from_matrix(uint_t num_qubits, const cmatrix_t &mat) {
S.resize(std::min(reshaped_matrix.GetRows(), reshaped_matrix.GetColumns()));

#ifdef AER_THRUST_CUDA
- if (MPS::mps_svd_device_.compare("GPU") == 0) {
+ // Before offloading to the GPU, we need to make sure the matrices
+ // to be operated on are big enough to make the time taken by the
+ // initialization of `cutensornet` worth it. For fewer than 13 qubits
+ // the matrices involved are too small to be worth offloading to the
+ // GPU. Even when num_qubits > 13, many of the matrices are still too
+ // small, so we also gate on the number of elements in the matrix.
+ // The thresholds of num_qubits > 13 and more than `8401` elements were
+ // obtained not on any theoretical basis but by trial and error; they
+ // depend solely on the relative speeds of the CPU and GPU involved.
+ // Even when the matrices are big, they are not big enough to make
+ // PCIe bandwidth a bottleneck. In this particular case the CPU was
+ // `Zen+` and the GPU was `NVIDIA Ampere`.
+ if ((num_qubits_ > 13) && (MPS::mps_device_.compare("GPU") == 0) &&
+     ((reshaped_matrix.GetRows() * reshaped_matrix.GetColumns()) > 8401)) {

cutensor_csvd_wrapper(reshaped_matrix, U, S, V, cuda_stream,
cutensor_handle);
} else {
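The offload gate above, extracted as a standalone predicate for clarity (a sketch, not code from the commit; the function name and free-function form are illustrative, while the thresholds are the empirical ones from the comment):

#include <cstddef>
#include <string>

// Sketch only: mirrors the condition guarding cutensor_csvd_wrapper above.
// Thresholds were tuned by trial and error on a Zen+ CPU / Ampere GPU pair.
bool worth_gpu_svd(std::size_t num_qubits, const std::string &device,
                   std::size_t rows, std::size_t cols) {
  return num_qubits > 13 && device == "GPU" && rows * cols > 8401;
}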
@@ -85,16 +85,21 @@ class MPS {
#ifdef AER_THRUST_CUDA
cuda_stream = NULL;
cutensor_handle = NULL;
- if (mps_svd_device_.compare("GPU") == 0) {
+ cublas_handle = NULL;
+ if (mps_device_.compare("GPU") == 0) {
cudaStreamCreate(&cuda_stream);
cutensornetCreate(&cutensor_handle);
+ cublasCreate(&cublas_handle);
+ cublasSetStream(cublas_handle, cuda_stream);
}
#endif // AER_THRUST_CUDA
}
~MPS() {
#ifdef AER_THRUST_CUDA
if (cutensor_handle)
cutensornetDestroy(cutensor_handle);
+ if (cublas_handle)
+ cublasDestroy(cublas_handle);
if (cuda_stream)
cudaStreamDestroy(cuda_stream);
#endif // AER_THRUST_CUDA
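The constructor and destructor above tie all three GPU resources to a single stream. A standalone sketch of that lifecycle, assuming a CUDA toolkit with cuBLAS and cuQuantum's cuTensorNet available (illustrative program, not Aer code):

#include <cublas_v2.h>
#include <cuda_runtime.h>
#include <cutensornet.h>

int main() {
  cudaStream_t stream = NULL;
  cutensornetHandle_t tn_handle = NULL;
  cublasHandle_t blas_handle = NULL;

  cudaStreamCreate(&stream);
  cutensornetCreate(&tn_handle);
  cublasCreate(&blas_handle);
  // Run the cuBLAS GEMMs on the same stream as the cuTensorNet SVD work.
  cublasSetStream(blas_handle, stream);

  // ... submit work ...

  // Tear down in the same order as the destructor above.
  cutensornetDestroy(tn_handle);
  cublasDestroy(blas_handle);
  cudaStreamDestroy(stream);
  return 0;
}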
@@ -338,8 +343,8 @@ class MPS {

static void set_mps_lapack_svd(bool mps_lapack) { mps_lapack_ = mps_lapack; }
#ifdef AER_THRUST_CUDA
- static void set_mps_svd_device(std::string mps_svd_device) {
- mps_svd_device_ = mps_svd_device;
+ static void set_mps_device(std::string mps_device) {
+ mps_device_ = mps_device;
}
#endif // AER_THRUST_CUDA

@@ -568,6 +573,7 @@ class MPS {
#ifdef AER_THRUST_CUDA
cudaStream_t cuda_stream;
cutensornetHandle_t cutensor_handle;
cublasHandle_t cublas_handle;
#endif // AER_THRUST_CUDA

struct ordering {
@@ -597,7 +603,7 @@ class MPS {
static MPS_swap_direction mps_swap_direction_;
static bool mps_lapack_;
#ifdef AER_THRUST_CUDA
- static std::string mps_svd_device_;
+ static std::string mps_device_;
#endif // AER_THRUST_CUDA
};

167 changes: 158 additions & 9 deletions src/simulators/matrix_product_state/matrix_product_state_tensor.hpp
@@ -15,6 +15,19 @@
#ifndef _tensor_tensor_hpp_
#define _tensor_tensor_hpp_

#ifdef AER_THRUST_CUDA
#include <cublas_v2.h>
#define HANDLE_CUBLAS_ERROR(x)                                                 \
  do {                                                                         \
    const auto err = x;                                                        \
    if (err != CUBLAS_STATUS_SUCCESS) {                                        \
      printf("Error: %s in line %d\n", cublasGetStatusString(err), __LINE__);  \
      fflush(stdout);                                                          \
    }                                                                          \
  } while (0)

#endif // AER_THRUST_CUDA
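A hedged usage sketch for the macro: wrap each cuBLAS call so a failing status is reported with its line number (the cuBLAS calls are standard API; the snippet itself is illustrative):

cublasHandle_t handle = NULL;
HANDLE_CUBLAS_ERROR(cublasCreate(&handle));
// ... wrapped cublasSetStream / cublasZgemm calls go here ...
HANDLE_CUBLAS_ERROR(cublasDestroy(handle));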

#include <complex>
#include <cstdio>
#include <exception>
@@ -154,19 +167,33 @@ class MPS_Tensor {
void mul_Gamma_by_right_Lambda(const rvector_t &Lambda);
void div_Gamma_by_left_Lambda(const rvector_t &Lambda);
void div_Gamma_by_right_Lambda(const rvector_t &Lambda);
- static MPS_Tensor contract(const MPS_Tensor &left_gamma,
- const rvector_t &lambda,
- const MPS_Tensor &right_gamma, bool mul_by_lambda);

#ifdef AER_THRUST_CUDA
static double Decompose(MPS_Tensor &temp, MPS_Tensor &left_gamma,
rvector_t &lambda, MPS_Tensor &right_gamma,
- bool mps_lapack, std::string mps_svd_device,
- cudaStream_t &cuda_stream,
+ bool mps_lapack, std::string mps_device,
+ uint_t num_qubits, cudaStream_t &cuda_stream,
cutensornetHandle_t &cutensor_handle);

static MPS_Tensor contract(const MPS_Tensor &left_gamma,
const rvector_t &lambda,
const MPS_Tensor &right_gamma,
std::string mps_device, uint_t num_qubits,
const cublasHandle_t &cublas_handle,
bool mul_by_lambda);

static void cublas_Zgemm_(cmatrix_t &A, cmatrix_t &B, cmatrix_t &C,
const cublasHandle_t &cublas_handle);

#else
static double Decompose(MPS_Tensor &temp, MPS_Tensor &left_gamma,
rvector_t &lambda, MPS_Tensor &right_gamma,
bool mps_lapack);

static MPS_Tensor contract(const MPS_Tensor &left_gamma,
const rvector_t &lambda,
const MPS_Tensor &right_gamma, bool mul_by_lambda);

#endif // AER_THRUST_CUDA

static void
@@ -503,6 +530,65 @@ void MPS_Tensor::mul_Gamma_by_Lambda(const rvector_t &Lambda,
// tensors to contract.
// Returns: The result tensor of the contract
//---------------------------------------------------------------
#ifdef AER_THRUST_CUDA
MPS_Tensor MPS_Tensor::contract(const MPS_Tensor &left_gamma,
const rvector_t &lambda,
const MPS_Tensor &right_gamma,
std::string mps_device, uint_t num_qubits,
const cublasHandle_t &cublas_handle,
bool mul_by_lambda = true) {
MPS_Tensor Res;
MPS_Tensor new_left = left_gamma;
if (mul_by_lambda) {
new_left.mul_Gamma_by_right_Lambda(lambda);
}

for (uint_t idx1 = 0; idx1 < new_left.data_.size(); idx1++) {
for (uint_t idx2 = 0; idx2 < right_gamma.data_.size(); idx2++) {

// Before offloading to the GPU, we need to make sure the matrices
// to be operated on are big enough to make the time taken by the
// initialization of the GPU routines worth it.
// For fewer than 16 qubits the matrices involved are too small to be
// worth offloading to the GPU, and even when num_qubits > 16, many of
// the matrices are still too small. Matrices with rows/cols > 128
// show a speedup when multiplied on the GPU compared to the CPU.
// This observation was obtained by trial and error and does not have
// any theoretical basis; the thresholds depend solely on the relative
// speeds of the CPU and GPU involved. Even when the matrices are big,
// they are not big enough to make PCIe bandwidth a bottleneck.
// In this particular case the CPU was `Zen+` and the GPU was
// `NVIDIA Ampere`.

if ((num_qubits > 16) && (mps_device.compare("GPU") == 0)) {
cmatrix_t mat1 = new_left.data_[idx1], mat2 = right_gamma.data_[idx2];

uint_t mat1_rows = mat1.GetRows(), mat1_cols = mat1.GetColumns(),
mat2_rows = mat2.GetRows(), mat2_cols = mat2.GetColumns();

// Even when mps_device is set to `GPU`, we also need to make sure
// the matrices are large enough to be worth invoking the GPU
// routines. If they are not, the multiplication is done on the CPU
// using the OpenBLAS zgemm_ routine.
if ((mat1_rows > 128) && (mat1_cols > 128) && (mat2_cols > 128)) {
cmatrix_t mat1_mat2(mat1_rows, mat2_cols);
cublas_Zgemm_(mat1, mat2, mat1_mat2, cublas_handle);
Res.data_.push_back(mat1_mat2);
} else {
Res.data_.push_back(mat1 * mat2);
}
} else {
Res.data_.push_back(new_left.data_[idx1] * right_gamma.data_[idx2]);
}
}
}

return Res;
}
#else
MPS_Tensor MPS_Tensor::contract(const MPS_Tensor &left_gamma,
const rvector_t &lambda,
const MPS_Tensor &right_gamma,
@@ -514,11 +600,11 @@ MPS_Tensor MPS_Tensor::contract(const MPS_Tensor &left_gamma,
}
for (uint_t i = 0; i < new_left.data_.size(); i++)
for (uint_t j = 0; j < right_gamma.data_.size(); j++) {

Res.data_.push_back(new_left.data_[i] * right_gamma.data_[j]);
}
return Res;
}
#endif
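A rough back-of-envelope supports the 128 cutoff used in the GPU branch (illustrative arithmetic, not from the commit): a 128x128 complex-double GEMM moves (3 * 128 * 128 * 16) bytes, about 0.8 MB, over PCIe, but performs only 8 * 128^3 (about 16.8 million) floating-point operations, so below this size transfer and launch latency dominate. The size gate itself, as a standalone sketch with illustrative names:

#include <cstddef>
#include <string>

// Sketch only: mirrors the dispatch inside the GPU build of contract().
// m and k are the rows/cols of the left matrix, n the cols of the right.
bool worth_gpu_gemm(std::size_t num_qubits, const std::string &device,
                    std::size_t m, std::size_t k, std::size_t n) {
  return num_qubits > 16 && device == "GPU" && m > 128 && k > 128 && n > 128;
}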

//---------------------------------------------------------------
// Function name: contract_2_dimensions
@@ -604,8 +690,8 @@ void MPS_Tensor::contract_2_dimensions(const MPS_Tensor &left_gamma,
#ifdef AER_THRUST_CUDA
double MPS_Tensor::Decompose(MPS_Tensor &temp, MPS_Tensor &left_gamma,
rvector_t &lambda, MPS_Tensor &right_gamma,
- bool mps_lapack, std::string mps_svd_device,
- cudaStream_t &cuda_stream,
+ bool mps_lapack, std::string mps_device,
+ uint_t num_qubits, cudaStream_t &cuda_stream,
cutensornetHandle_t &cutensor_handle)
#else
double MPS_Tensor::Decompose(MPS_Tensor &temp, MPS_Tensor &left_gamma,
@@ -619,7 +705,22 @@ double MPS_Tensor::Decompose(MPS_Tensor &temp, MPS_Tensor &left_gamma,
rvector_t S(std::min(C.GetRows(), C.GetColumns()));

#ifdef AER_THRUST_CUDA
- if (mps_svd_device.compare("GPU") == 0) {
+ // Before offloading to the GPU, we need to make sure the matrices
+ // to be operated on are big enough to make the time taken by the
+ // initialization of `cutensornet` worth it. For fewer than 13 qubits
+ // the matrices involved are too small to be worth offloading to the
+ // GPU. Even when num_qubits > 13, many of the matrices are still too
+ // small, so we also gate on the number of elements in the matrix.
+ // The thresholds of num_qubits > 13 and more than `8401` elements were
+ // obtained not on any theoretical basis but by trial and error; they
+ // depend solely on the relative speeds of the CPU and GPU involved.
+ // Even when the matrices are big, they are not big enough to make
+ // PCIe bandwidth a bottleneck. In this particular case the CPU was
+ // `Zen+` and the GPU was `NVIDIA Ampere`.
+ if ((num_qubits > 13) && (mps_device.compare("GPU") == 0) &&
+     ((C.GetRows() * C.GetColumns()) > 8401)) {
cutensor_csvd_wrapper(C, U, S, V, cuda_stream, cutensor_handle);
} else {
csvd_wrapper(C, U, S, V, mps_lapack);
@@ -660,6 +761,54 @@ void MPS_Tensor::reshape_for_3_qubits_before_SVD(
reshaped_tensor = MPS_Tensor(new_data_vector);
}

#ifdef AER_THRUST_CUDA
void MPS_Tensor::cublas_Zgemm_(cmatrix_t &A, cmatrix_t &B, cmatrix_t &C,
const cublasHandle_t &cublas_handle) {
size_t A_rows = A.GetRows(), A_cols = A.GetColumns(), B_rows = B.GetRows(),
B_cols = B.GetColumns(),
size_cuDoubleComplex = sizeof(cuDoubleComplex);

size_t size_A = A_rows * A_cols * size_cuDoubleComplex,
size_B = B_rows * B_cols * size_cuDoubleComplex,
size_C = A_rows * B_cols * size_cuDoubleComplex;

cuDoubleComplex *A_data_dev, *B_data_dev, *C_data_dev;

HANDLE_CUDA_ERROR(cudaMalloc((void **)&A_data_dev, size_A));
HANDLE_CUDA_ERROR(cudaMalloc((void **)&B_data_dev, size_B));
HANDLE_CUDA_ERROR(cudaMalloc((void **)&C_data_dev, size_C));

complex_t *A_data = A.move_to_buffer(), *B_data = B.move_to_buffer();

HANDLE_CUDA_ERROR(
cudaMemcpy(A_data_dev, A_data, size_A, cudaMemcpyHostToDevice));
HANDLE_CUDA_ERROR(
cudaMemcpy(B_data_dev, B_data, size_B, cudaMemcpyHostToDevice));

A = std::move(cmatrix_t::move_from_buffer(A_rows, A_cols, A_data));
B = std::move(cmatrix_t::move_from_buffer(B_rows, B_cols, B_data));

cuDoubleComplex alpha = make_cuDoubleComplex(1.0, 0.0),
beta = make_cuDoubleComplex(0.0, 0.0);

HANDLE_CUBLAS_ERROR(cublasZgemm(
    cublas_handle, CUBLAS_OP_N, CUBLAS_OP_N, A_rows, B_cols, A_cols, &alpha,
    reinterpret_cast<const cuDoubleComplex *>(A_data_dev), A_rows,
    reinterpret_cast<const cuDoubleComplex *>(B_data_dev), B_rows, &beta,
    C_data_dev, A_rows));

complex_t *C_res = reinterpret_cast<complex_t *>(malloc(size_C));
HANDLE_CUDA_ERROR(
cudaMemcpy(C_res, C_data_dev, size_C, cudaMemcpyDeviceToHost));
C = std::move(cmatrix_t::copy_from_buffer(A_rows, B_cols, C_res));

HANDLE_CUDA_ERROR(cudaFree(A_data_dev));
HANDLE_CUDA_ERROR(cudaFree(B_data_dev));
HANDLE_CUDA_ERROR(cudaFree(C_data_dev));
if (C_res)
free(C_res);
}
#endif // AER_THRUST_CUDA
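A hedged usage sketch for cublas_Zgemm_ (assuming the method is publicly accessible and cmatrix_t keeps its (rows, cols) constructor; sizes are illustrative and above the 128 threshold):

cmatrix_t A(256, 256), B(256, 256), C(256, 256);
// ... fill A and B with the complex entries to be multiplied ...
cublasHandle_t handle = NULL;
HANDLE_CUBLAS_ERROR(cublasCreate(&handle));
MPS_Tensor::cublas_Zgemm_(A, B, C, handle); // C = A * B, computed on the GPU
HANDLE_CUBLAS_ERROR(cublasDestroy(handle));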

//-------------------------------------------------------------------------
} // end namespace MatrixProductState
//-------------------------------------------------------------------------
1 change: 0 additions & 1 deletion src/simulators/matrix_product_state/svd.cpp
@@ -672,7 +672,6 @@ void lapack_csvd_wrapper(cmatrix_t &A, cmatrix_t &U, rvector_t &S,
void cutensor_csvd_wrapper(cmatrix_t &A, cmatrix_t &U, rvector_t &S,
cmatrix_t &V, cudaStream_t &stream,
cutensornetHandle_t &handle) {

bool transposed = false;

const int64_t rows = A.GetRows(), cols = A.GetColumns();
