Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

MPS with cuQuantum #2168

Open
wants to merge 36 commits into
base: main
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from 13 commits
Commits
Show all changes
36 commits
Select commit Hold shift + click to select a range
563ae6e
initial layout
MozammilQ Jun 5, 2024
280f868
refactor code
MozammilQ Jun 5, 2024
ae44c69
refactor code
MozammilQ Jun 6, 2024
5b48265
refactor code
MozammilQ Jun 6, 2024
517a554
refactor code
MozammilQ Jun 6, 2024
7e40588
refactor code
MozammilQ Jun 6, 2024
ebf9ca0
refactor code
MozammilQ Jun 6, 2024
a422690
refactor code
MozammilQ Jun 6, 2024
80b59d5
refactor code
MozammilQ Jun 6, 2024
52f1ed4
refactor code
MozammilQ Jun 7, 2024
ed43e71
refactor code
MozammilQ Jun 9, 2024
649a5d7
refactor code
MozammilQ Jun 9, 2024
83e4b5e
Merge branch 'main' into mps-cutensor
doichanj Jun 10, 2024
c33571f
refactor code
MozammilQ Jun 11, 2024
abc5552
Merge branch 'main' into mps-cutensor
doichanj Jun 14, 2024
f0205e3
refactor code
MozammilQ Jun 14, 2024
629f65f
refactor code
MozammilQ Jun 15, 2024
644a822
added release note
MozammilQ Jun 16, 2024
e6f2288
refactor code
MozammilQ Jun 17, 2024
42f983e
Merge branch 'Qiskit:main' into mps-cutensor
MozammilQ Jun 17, 2024
c24b9e2
refactor code
MozammilQ Jun 18, 2024
34e9502
refactor code
MozammilQ Jun 18, 2024
00f88e9
refactor code; included test
MozammilQ Jun 18, 2024
454f8c0
lint
MozammilQ Jun 18, 2024
985c7f2
added suggestion
MozammilQ Jun 18, 2024
7ffab7d
Merge branch 'main' into mps-cutensor
doichanj Jul 4, 2024
6b0b41d
Merge branch 'main' into mps-cutensor
MozammilQ Aug 30, 2024
34a5e75
fixed a typo
MozammilQ Aug 31, 2024
a1ae308
refactor code
MozammilQ Sep 10, 2024
859e946
Merge branch 'Qiskit:main' into mps-cutensor
MozammilQ Oct 4, 2024
2ae116e
Merge branch 'Qiskit:main' into mps-cutensor
MozammilQ Nov 6, 2024
ed9a907
refactor code
MozammilQ Nov 11, 2024
a4dbd12
Merge branch 'Qiskit:main' into mps-cutensor
MozammilQ Nov 12, 2024
e1e80d1
added cublas for contract
MozammilQ Nov 28, 2024
321230c
fixed typo
MozammilQ Dec 3, 2024
702ecd0
Merge branch 'Qiskit:main' into mps-cutensor
MozammilQ Dec 12, 2024
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
268 changes: 268 additions & 0 deletions src/simulators/matrix_product_state/svd.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -28,6 +28,7 @@
#include <stdlib.h>

namespace AER {

// default values
constexpr auto mul_factor = 1e2;
constexpr long double tiny_factor = 1e30;
Expand Down Expand Up @@ -552,9 +553,23 @@ status csvd(cmatrix_t &A, cmatrix_t &U, rvector_t &S, cmatrix_t &V) {
// Dispatch to an SVD backend.
//
// On CUDA (AER_THRUST_CUDA) builds the cuTensorNet-based implementation is
// always used; otherwise `lapack` selects between the LAPACK-based wrapper
// and the original Qiskit implementation.
//
// NOTE(review): the definition of cutensor_csvd_wrapper later in this file is
// inside a nested `namespace TensorNetwork`, while this unqualified call (and
// the declaration in svd.hpp) are not — confirm both resolve to the same
// symbol, otherwise the CUDA build will fail to link.
void csvd_wrapper(cmatrix_t &A, cmatrix_t &U, rvector_t &S, cmatrix_t &V,
                  bool lapack) {
#ifdef AER_THRUST_CUDA
  // In the original code both the lapack and non-lapack branches collapsed to
  // the same cuTensorNet call on CUDA builds, so the flag is intentionally
  // ignored here.
  (void)lapack;
  cutensor_csvd_wrapper(A, U, S, V);
#else
  if (lapack) {
    lapack_csvd_wrapper(A, U, S, V);
  } else {
    qiskit_csvd_wrapper(A, U, S, V);
  }
#endif // AER_THRUST_CUDA
}

Expand Down Expand Up @@ -667,4 +682,257 @@ void lapack_csvd_wrapper(cmatrix_t &A, cmatrix_t &U, rvector_t &S,
}
}

#ifdef AER_THRUST_CUDA

// NOTE(review): `namespace AER` (opened near the top of this file) is still
// open at this point, so these system headers are included *inside* the
// namespace.  That wraps all of their declarations in AER:: and is likely to
// break compilation or linkage — these includes should move to the top of the
// file.
#include <cuda.h>
#include <cuda_runtime.h>
#include <cutensornet.h>
#include <vector>

// Throws std::runtime_error with the cuTensorNet error string when a
// cutensornet* call does not return CUTENSORNET_STATUS_SUCCESS.
// NOTE(review): a plain-brace body with a trailing `;` misbehaves inside an
// unbraced if/else; the conventional `do { ... } while (0)` wrapper would be
// safer.  The "TensorNet::contractor" text also looks copied from the
// contraction sample — consider naming the SVD wrapper instead.
#define HANDLE_ERROR(x)                                                        \
  {                                                                            \
    const auto err = x;                                                        \
    if (err != CUTENSORNET_STATUS_SUCCESS) {                                   \
      std::stringstream str;                                                   \
      str << "ERROR TensorNet::contractor : "                                  \
          << cutensornetGetErrorString(err);                                   \
      throw std::runtime_error(str.str());                                     \
    }                                                                          \
  };

// Throws std::runtime_error with the CUDA error string when a CUDA runtime
// call does not return cudaSuccess.  Same do-while caveat as HANDLE_ERROR.
#define HANDLE_CUDA_ERROR(x)                                                   \
  {                                                                            \
    const auto err = x;                                                        \
    if (err != cudaSuccess) {                                                  \
      std::stringstream str;                                                   \
      str << "ERROR TensorNet::contractor : " << cudaGetErrorString(err);      \
      throw std::runtime_error(str.str());                                     \
    }                                                                          \
  };

namespace TensorNetwork {
// Singular value decomposition of A on the GPU via cuTensorNet
// (cutensornetTensorSVD), writing the factors back into U, S and V.
// Closely follows the cuQuantum tensor-SVD sample.
//
// NOTE(review): the call site in csvd_wrapper and the declaration in svd.hpp
// refer to an unqualified `cutensor_csvd_wrapper` outside this nested
// namespace — confirm this definition is the one that actually links.
void cutensor_csvd_wrapper(cmatrix_t &A, cmatrix_t &U, rvector_t &S,
                           cmatrix_t &V) {
  const int64_t m = A.GetRows(), n = A.GetColumns();
  const int64_t min_dim = std::min(m, n);
  const int64_t lda = std::max(m, n);

  // Transpose when m < n so the decomposed matrix is tall (lda x min_dim).
  // NOTE(review): at the end only A is daggered back; U and V are not
  // swapped/conjugated for the transposed case — verify the returned factors
  // are correct when m < n.
  bool transposed = false;
  if (m < n) {
    transposed = true;
    A = AER::Utils::dagger(A);
  }

  // Take ownership of the raw element buffers; the matrices are rebuilt from
  // these buffers via move_from_buffer before returning.
  complex_t *cutensor_A = A.move_to_buffer(), *cutensor_U = U.move_to_buffer(),
            *cutensor_V = V.move_to_buffer();

  // NOTE(review): the version and device-property queries below are never
  // read afterwards in this function — leftovers from the sample?
  const size_t cuTensornetVersion = cutensornetGetVersion();

  cudaDeviceProp prop;
  int deviceId{-1};
  HANDLE_CUDA_ERROR(cudaGetDevice(&deviceId));
  HANDLE_CUDA_ERROR(cudaGetDeviceProperties(&prop, deviceId));

  // NOTE(review): floatType is unused; the data type is double-precision
  // complex (CUDA_C_64F).
  typedef double floatType;
  cudaDataType_t typeData = CUDA_C_64F;

  // Tensor mode labels for A = U * diag(S) * V.
  // NOTE(review): U is labeled {'n','x'}, sharing mode 'n' with A's *second*
  // mode, and V reuses 'n' as well — double-check against the
  // cutensornetTensorSVD convention (typically A{m,n} -> U{m,x}, V{x,n}).
  std::vector<int32_t> modesA{'m', 'n'}; // input
  std::vector<int32_t> modesU{'n', 'x'};
  std::vector<int32_t> modesV{'x', 'n'}; // SVD output

  // NOTE(review): sizeof(A) is the size of the cmatrix_t *object* (a handful
  // of pointers/ints), NOT of its element buffer.  Every cudaMalloc /
  // cudaMemcpy / cudaMemsetAsync below therefore uses the wrong byte count;
  // the sizes should be rows * cols * sizeof(complex_t) for the matrices and
  // min_dim * sizeof(<S element type>) for S.
  size_t sizeA = sizeof(A);
  size_t sizeU = sizeof(U);
  size_t sizeS = sizeof(S);
  size_t sizeV = sizeof(V);

  // NOTE(review): same object-size bug — this allocates sizeof(rvector_t)
  // bytes, yet min_dim complex values are read from this buffer below.  Also
  // S is real on the host (rvector_t) while the device buffer is treated as
  // complex here; confirm the element type cuTensorNet writes for S.  This
  // buffer is never free()d (leak).
  complex_t *cutensor_S = (complex_t *)malloc(sizeof(S));

  // Device-side copies of A, U, S, V.
  void *D_A;
  void *D_U;
  void *D_S;
  void *D_V;

  HANDLE_CUDA_ERROR(cudaMalloc((void **)&D_A, sizeA));
  HANDLE_CUDA_ERROR(cudaMalloc((void **)&D_U, sizeU));
  HANDLE_CUDA_ERROR(cudaMalloc((void **)&D_S, sizeS));
  HANDLE_CUDA_ERROR(cudaMalloc((void **)&D_V, sizeV));

  HANDLE_CUDA_ERROR(cudaMemcpy(D_A, cutensor_A, sizeA, cudaMemcpyHostToDevice));

  // NOTE(review): this stream is created but never destroyed (resource leak)
  // — add cudaStreamDestroy(stream) to the cleanup section.
  cudaStream_t stream;
  HANDLE_CUDA_ERROR(cudaStreamCreate(&stream));

  cutensornetHandle_t handle;
  HANDLE_ERROR(cutensornetCreate(&handle));

  cutensornetTensorDescriptor_t descTensorA;
  cutensornetTensorDescriptor_t descTensorU;
  cutensornetTensorDescriptor_t descTensorV;

  const int32_t numModesA = modesA.size();
  const int32_t numModesU = modesU.size();
  const int32_t numModesV = modesV.size();

  // Extents: after the optional transpose A is (lda x min_dim).
  // NOTE(review): extentU is (min_dim x min_dim) here, but U is rebuilt below
  // as an (lda x lda) matrix via move_from_buffer — these disagree.
  std::vector<int64_t> extentA{lda, min_dim};     // shape of A
  std::vector<int64_t> extentU{min_dim, min_dim}; // shape of U
  std::vector<int64_t> extentS{min_dim};          // shape of S
  std::vector<int64_t> extentV{min_dim, min_dim}; // shape of V

  const int64_t *strides =
      NULL; // matrices stores the entries in column-major-order.

  HANDLE_ERROR(cutensornetCreateTensorDescriptor(
      handle, numModesA, extentA.data(), strides, modesA.data(), typeData,
      &descTensorA));
  HANDLE_ERROR(cutensornetCreateTensorDescriptor(
      handle, numModesU, extentU.data(), strides, modesU.data(), typeData,
      &descTensorU));
  HANDLE_ERROR(cutensornetCreateTensorDescriptor(
      handle, numModesV, extentV.data(), strides, modesV.data(), typeData,
      &descTensorV));

  cutensornetTensorSVDConfig_t svdConfig;
  HANDLE_ERROR(cutensornetCreateTensorSVDConfig(handle, &svdConfig));

  // set up truncation parameters
  // NOTE(review): these cutoffs silently truncate small singular values —
  // confirm this is compatible with the MPS truncation handled elsewhere.
  double absCutoff = 1e-13;
  HANDLE_ERROR(cutensornetTensorSVDConfigSetAttribute(
      handle, svdConfig, CUTENSORNET_TENSOR_SVD_CONFIG_ABS_CUTOFF, &absCutoff,
      sizeof(absCutoff)));
  double relCutoff = 4e-13;
  HANDLE_ERROR(cutensornetTensorSVDConfigSetAttribute(
      handle, svdConfig, CUTENSORNET_TENSOR_SVD_CONFIG_REL_CUTOFF, &relCutoff,
      sizeof(relCutoff)));

  // optional: choose gesvdj algorithm with customized parameters. Default is
  // gesvd.
  cutensornetTensorSVDAlgo_t svdAlgo = CUTENSORNET_TENSOR_SVD_ALGO_GESVDJ;
  HANDLE_ERROR(cutensornetTensorSVDConfigSetAttribute(
      handle, svdConfig, CUTENSORNET_TENSOR_SVD_CONFIG_ALGO, &svdAlgo,
      sizeof(svdAlgo)));
  cutensornetGesvdjParams_t gesvdjParams{/*tol=*/1e-12, /*maxSweeps=*/80};
  HANDLE_ERROR(cutensornetTensorSVDConfigSetAttribute(
      handle, svdConfig, CUTENSORNET_TENSOR_SVD_CONFIG_ALGO_PARAMS,
      &gesvdjParams, sizeof(gesvdjParams)));

  /********************************************************
   * Create SVDInfo to record runtime SVD truncation details
   *********************************************************/

  cutensornetTensorSVDInfo_t svdInfo;
  HANDLE_ERROR(cutensornetCreateTensorSVDInfo(handle, &svdInfo));

  /**************************************************************
   * Query the required workspace sizes and allocate memory
   **************************************************************/

  cutensornetWorkspaceDescriptor_t workDesc;
  HANDLE_ERROR(cutensornetCreateWorkspaceDescriptor(handle, &workDesc));
  HANDLE_ERROR(cutensornetWorkspaceComputeSVDSizes(
      handle, descTensorA, descTensorU, descTensorV, svdConfig, workDesc));
  int64_t hostWorkspaceSize, deviceWorkspaceSize;
  // for tensor SVD, it does not matter which cutensornetWorksizePref_t we pick
  HANDLE_ERROR(cutensornetWorkspaceGetMemorySize(
      handle, workDesc, CUTENSORNET_WORKSIZE_PREF_RECOMMENDED,
      CUTENSORNET_MEMSPACE_DEVICE, CUTENSORNET_WORKSPACE_SCRATCH,
      &deviceWorkspaceSize));
  HANDLE_ERROR(cutensornetWorkspaceGetMemorySize(
      handle, workDesc, CUTENSORNET_WORKSIZE_PREF_RECOMMENDED,
      CUTENSORNET_MEMSPACE_HOST, CUTENSORNET_WORKSPACE_SCRATCH,
      &hostWorkspaceSize));

  void *devWork = nullptr, *hostWork = nullptr;
  if (deviceWorkspaceSize > 0) {
    HANDLE_CUDA_ERROR(cudaMalloc(&devWork, deviceWorkspaceSize));
  }
  if (hostWorkspaceSize > 0) {
    hostWork = malloc(hostWorkspaceSize);
  }
  HANDLE_ERROR(cutensornetWorkspaceSetMemory(
      handle, workDesc, CUTENSORNET_MEMSPACE_DEVICE,
      CUTENSORNET_WORKSPACE_SCRATCH, devWork, deviceWorkspaceSize));
  HANDLE_ERROR(cutensornetWorkspaceSetMemory(
      handle, workDesc, CUTENSORNET_MEMSPACE_HOST,
      CUTENSORNET_WORKSPACE_SCRATCH, hostWork, hostWorkspaceSize));

  /**********
   * Execution
   ***********/

  // NOTE(review): numRuns = 3 is a leftover from the cuQuantum benchmark
  // sample ("stable perf results") — a production SVD should run exactly
  // once; the extra runs triple the cost of every decomposition.
  const int numRuns = 3; // to get stable perf results
  for (int i = 0; i < numRuns; ++i) {
    // restore output
    // NOTE(review): the return codes of these four calls are unchecked, and
    // the blanket cudaDeviceSynchronize() defeats the point of the stream.
    cudaMemsetAsync(D_U, 0, sizeU, stream);
    cudaMemsetAsync(D_S, 0, sizeS, stream);
    cudaMemsetAsync(D_V, 0, sizeV, stream);
    cudaDeviceSynchronize();

    // With value-based truncation, `cutensornetTensorSVD` can potentially
    // update the shared extent in descTensorU/V. We here restore descTensorU/V
    // to the original problem.
    HANDLE_ERROR(cutensornetDestroyTensorDescriptor(descTensorU));
    HANDLE_ERROR(cutensornetDestroyTensorDescriptor(descTensorV));
    HANDLE_ERROR(cutensornetCreateTensorDescriptor(
        handle, numModesU, extentU.data(), strides, modesU.data(), typeData,
        &descTensorU));
    HANDLE_ERROR(cutensornetCreateTensorDescriptor(
        handle, numModesV, extentV.data(), strides, modesV.data(), typeData,
        &descTensorV));

    HANDLE_ERROR(cutensornetTensorSVD(handle, descTensorA, D_A, descTensorU,
                                      D_U, D_S, descTensorV, D_V, svdConfig,
                                      svdInfo, workDesc, stream));
  }

  // NOTE(review): these async copies omit the stream argument (so they run on
  // the default stream, not `stream`), and nothing synchronizes before the
  // host reads cutensor_S below — a cudaStreamSynchronize/cudaDeviceSynchronize
  // is required before touching the results, otherwise this is a data race.
  HANDLE_CUDA_ERROR(
      cudaMemcpyAsync(cutensor_U, D_U, sizeU, cudaMemcpyDeviceToHost));
  HANDLE_CUDA_ERROR(
      cudaMemcpyAsync(cutensor_S, D_S, sizeS, cudaMemcpyDeviceToHost));
  HANDLE_CUDA_ERROR(
      cudaMemcpyAsync(cutensor_V, D_V, sizeV, cudaMemcpyDeviceToHost));

  // Copy the singular values (real parts) into the caller's vector.
  S.clear();
  for (int i = 0; i < min_dim; i++)
    S.push_back(cutensor_S[i].real());

  // Rebuild the matrices around the raw buffers.
  // NOTE(review): U is rebuilt as (lda x lda) although its SVD descriptor
  // above was (min_dim x min_dim) — these dimensions must agree with the
  // buffer sizes actually allocated/copied.
  A = cmatrix_t::move_from_buffer(lda, min_dim, cutensor_A);
  U = cmatrix_t::move_from_buffer(lda, lda, cutensor_U);
  V = cmatrix_t::move_from_buffer(min_dim, min_dim, cutensor_V);

  // Undo the initial transpose of A.  See the note at the top: U and V are
  // not adjusted for the transposed case.
  if (transposed) {
    transposed = false;
    A = AER::Utils::dagger(A);
  }

  // Sanity check of the decomposition (presumably reconstructs A from the
  // factors; defined elsewhere in this file).
  validate_SVD_result(A, U, S, V);

  /***************
   * Free resources
   ****************/

  HANDLE_ERROR(cutensornetDestroyTensorDescriptor(descTensorA));
  HANDLE_ERROR(cutensornetDestroyTensorDescriptor(descTensorU));
  HANDLE_ERROR(cutensornetDestroyTensorDescriptor(descTensorV));
  HANDLE_ERROR(cutensornetDestroyTensorSVDConfig(svdConfig));
  HANDLE_ERROR(cutensornetDestroyTensorSVDInfo(svdInfo));
  HANDLE_ERROR(cutensornetDestroyWorkspaceDescriptor(workDesc));
  HANDLE_ERROR(cutensornetDestroy(handle));

  // NOTE(review): `stream` and `cutensor_S` are not released here.
  if (D_A)
    cudaFree(D_A);
  if (D_U)
    cudaFree(D_U);
  if (D_S)
    cudaFree(D_S);
  if (D_V)
    cudaFree(D_V);
  if (devWork)
    cudaFree(devWork);
  if (hostWork)
    free(hostWork);
}
} // namespace TensorNetwork
#endif // AER_THRUST_CUDA

} // namespace AER
6 changes: 6 additions & 0 deletions src/simulators/matrix_product_state/svd.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -50,6 +50,12 @@ void qiskit_csvd_wrapper(cmatrix_t &C, cmatrix_t &U, rvector_t &S,
void lapack_csvd_wrapper(cmatrix_t &C, cmatrix_t &U, rvector_t &S,
cmatrix_t &V);

#ifdef AER_THRUST_CUDA
// cuTensorNet-backed SVD, used by csvd_wrapper on CUDA builds.
// NOTE(review): this declaration is at this header's enclosing namespace
// scope, but the definition in svd.cpp lives inside a nested
// `namespace TensorNetwork` — confirm the two refer to the same function,
// otherwise the CUDA build will fail to link.
void cutensor_csvd_wrapper(cmatrix_t &C, cmatrix_t &U, rvector_t &S,
                           cmatrix_t &V);
#endif // AER_THRUST_CUDA

void validate_SVD_result(const cmatrix_t &A, const cmatrix_t &U,
const rvector_t &S, const cmatrix_t &V);
void validate_SVdD_result(const cmatrix_t &A, const cmatrix_t &U,
Expand Down
Loading