矩阵A(MxN) * 矩阵B(NxK) = 矩阵C(MxK) (M行K列)
// CPU reference GEMM: C (MxK) = A (MxN) * B (NxK), row-major.
// Fix: the original accumulated with `+=` into whatever C already held, so the
// result was only correct if the caller pre-zeroed C. A local accumulator makes
// the function self-contained and matches the stated contract C = A*B.
void gemmOnHost(float *A, float *B, float *C, const int M, const int N, const int K){
    for(int mi=0; mi<M; ++mi){
        for(int ki=0; ki<K; ++ki){
            float sum = 0.0f;
            for(int ni=0; ni<N; ++ni){ // reduce over the shared dimension N
                sum += A[mi*N + ni] * B[ni*K + ki];
            }
            C[mi*K + ki] = sum;       // overwrite, do not accumulate
        }
    }
    return;
}
// Naive GEMM kernel: C (MxK) = A (MxN) * B (NxK), row-major.
// One thread produces one element of C: the dot product of row `row` of A
// with column `col` of B. Expects a 2D launch where x covers K and y covers M.
__global__ void gemmOnGPU2D(float *A, float *B, float *C, int M, int N, int K){
    unsigned int col = blockIdx.x * blockDim.x + threadIdx.x; // column index into C / B
    unsigned int row = blockIdx.y * blockDim.y + threadIdx.y; // row index into C / A
    if (row >= M || col >= K) return; // guard the grid tail
    float acc = 0.0f;
    for (int n = 0; n < N; ++n)       // reduce over the shared dimension N
        acc += A[row * N + n] * B[n * K + col];
    C[row * K + col] = acc;
}
//
//
host调用方式:
dim3 block(32, 32);
dim3 grid((K + block.x - 1) / block.x, (M + block.y - 1) / block.y);
gemmOnGPU2D<<<grid, block>>>(d_MatA, d_MatB, d_MatC, M, N, K);
线程索引方式参考:
参考上图,就是将每一个block中矩阵A和矩阵B中用到的参数加载到share mem中,在计算时候提高计算访存比;
// Shared-memory GEMM attempt: each block stages into one dynamic shared buffer
// the blockDim.y full rows of A it needs (first blockDim.y*N floats) followed
// by the blockDim.x full columns of B it needs (next N*blockDim.x floats),
// then computes from shared memory.
// NOTE(review): shared-memory demand is (blockDim.y + blockDim.x) * N floats,
// which grows with N — for large N this exceeds the per-block limit and the
// launch fails with "invalid argument" (observed at N = 4096 in the results
// table in this file).
// NOTE(review): the __syncthreads() below sits inside the divergent
// `if (ixk < K && iym < M)` branch; in edge blocks where threads disagree on
// the guard, not all threads reach the barrier — undefined behavior.
// NOTE(review): every thread with the same threadIdx.y rewrites the same A row
// (and same-threadIdx.x threads the same B column) with identical values —
// redundant global traffic rather than a cooperative load.
__global__ void gemmOnGPU2DShareMem(float *A, float *B, float *C, const int M, const int N, const int K){
// dynamic shared memory; size supplied at launch as the 3rd <<<>>> argument
extern __shared__ float block_AB[];
unsigned int ixk = blockIdx.x * blockDim.x + threadIdx.x;
unsigned int iym = blockIdx.y * blockDim.y + threadIdx.y;
if (ixk < K && iym < M){
// stage this thread's A row and B column into shared memory
for(int ni=0; ni<N; ++ni){
block_AB[threadIdx.y*N + ni] = A[iym*N + ni];
block_AB[blockDim.y*N + ni*blockDim.x + threadIdx.x] = B[ni*K + ixk];
}
__syncthreads();
float sum = 0;
// dot product entirely out of shared memory
for(int ni=0; ni<N; ++ni){
sum += block_AB[threadIdx.y*N + ni] * block_AB[blockDim.y*N + ni*blockDim.x + threadIdx.x];
}
C[iym*K + ixk] = sum;
}
}
host调用方式:
int shareMemSize = dimx*nn + dimy*nn; // 动态共享内存大小: (blockDim.x + blockDim.y) * N
dim3 block(32, 32);
dim3 grid((K + block.x - 1) / block.x, (M + block.y - 1) / block.y);
gemmOnGPU2DShareMem<<<grid, block, shareMemSize*sizeof(float)>>>(d_MatA, d_MatB, d_MatC, M, N, K);
- 使用的共享内存有点多: block share mem size = blockDim.x * N + blockDim.y * N
-
1 一个kernel只能分配一块共享内存,可以使用下面的方式分配到不同变量:
extern __shared__ float smem[]; float *sharedM = smem; float *sharedN = (float*)&smem[blockDim.x * blockDim.y];
主要是针对方法3中占用大量共享内存的问题做了进一步优化;目前每块共享内存:block share mem size = blockDim.x*blockDim.y*2
// Tiled shared-memory GEMM: C (MxK) = A (MxN) * B (NxK), row-major.
// Each block walks the N dimension in tiles of blockDim.x, cooperatively
// loading one blockDim.y x blockDim.x tile of A and one of B per step.
// Dynamic shared memory required: 2 * blockDim.x * blockDim.y floats.
// Precondition: blockDim.x == blockDim.y (the tile step along N is blockDim.x
// for A but blockDim.y for B).
// Fixes vs. the previous version:
//  - __syncthreads() was inside the divergent `if (row < M && col < K)` guard,
//    so edge blocks hit undefined behavior; now every thread runs the tile
//    loop and barriers, and only the final store is guarded.
//  - out-of-range lanes load 0.0f instead of reading A/B out of bounds.
//  - sharedN was indexed with j*blockDim.y where the row stride is blockDim.x.
__global__ void gemmOnGPU2DShareMemV2(float *A, float *B, float *C, const int M, const int N, const int K){
    extern __shared__ float smem[];
    float *sharedM = smem;                                  // A tile: blockDim.y x blockDim.x
    float *sharedN = &smem[blockDim.x * blockDim.y];        // B tile: blockDim.y x blockDim.x
    int row = blockIdx.y * blockDim.y + threadIdx.y;
    int col = blockIdx.x * blockDim.x + threadIdx.x;
    float Csub = 0.0f;
    int tiles = (N + blockDim.x - 1) / blockDim.x;          // ceil(N / tile width)
    for (int i = 0; i < tiles; i++){
        int aCol = i * blockDim.x + threadIdx.x;            // column of A this lane loads
        int bRow = i * blockDim.y + threadIdx.y;            // row of B this lane loads
        sharedM[threadIdx.y*blockDim.x + threadIdx.x] =
            (row < M && aCol < N) ? A[row*N + aCol] : 0.0f; // zero-pad the tail tile
        sharedN[threadIdx.y*blockDim.x + threadIdx.x] =
            (bRow < N && col < K) ? B[bRow*K + col] : 0.0f;
        __syncthreads();                                    // tile fully staged
        for (int j = 0; j < blockDim.x; j++)
            Csub += sharedM[threadIdx.y*blockDim.x + j] * sharedN[j*blockDim.x + threadIdx.x];
        __syncthreads();                                    // done reading before next overwrite
    }
    if (row < M && col < K)                                 // only in-range threads store
        C[row*K + col] = Csub;
}
host调用方式:
// 上面的实现要求 blockDim.x == blockDim.y
int shareMemSize = dimx*dimy*2; // 动态共享内存大小: 2 * blockDim.x * blockDim.y
dim3 block(32, 32);
dim3 grid((K + block.x - 1) / block.x, (M + block.y - 1) / block.y);
gemmOnGPU2DShareMemV2<<<grid, block, shareMemSize*sizeof(float)>>>(d_MatA, d_MatB, d_MatC, M, N, K);
// Two-outputs-per-thread tiled GEMM: C (MxK) = A (MxN) * B (NxK), row-major.
// Launch contract: blockDim.x == BLOCK_SIZE (32), blockDim.y == BLOCK_SIZE/2 (16);
// each thread accumulates two C elements (rows `row` and `row + 16`) in
// registers, halving the y-dimension thread count.
// Fixes vs. the previous version:
//  - B and C were indexed with stride N where the row stride is K (it only
//    worked because the benchmark used M == N == K).
//  - the literal 16 is now derived from BLOCK_SIZE so the tie to blockDim.y
//    is explicit.
//  - __syncthreads() was inside the divergent `if (row < M && col < K)` guard;
//    all threads now run the tile loop, loads are zero-padded, and only the
//    final stores are guarded — so M, N, K need not be multiples of 32.
__global__ void gemmOnGPU2DRegMem(float *A, float *B, float *C, int M, int N, int K){
    const int BLOCK_SIZE = 32;
    const int HALF = BLOCK_SIZE / 2;            // expected blockDim.y
    __shared__ float shTileA[BLOCK_SIZE][BLOCK_SIZE];
    __shared__ float shTileB[BLOCK_SIZE][BLOCK_SIZE];
    int row = blockIdx.y * BLOCK_SIZE + threadIdx.y; // first of this thread's two C rows
    int col = blockIdx.x * BLOCK_SIZE + threadIdx.x;
    float val[2] = {0.0f, 0.0f};                // register accumulators, one per C row
    int iter = (N + BLOCK_SIZE - 1) / BLOCK_SIZE;
    for(int i = 0; i < iter; i++){
        int aCol = i*BLOCK_SIZE + threadIdx.x;  // column of A staged by this lane
        int bRow = i*BLOCK_SIZE + threadIdx.y;  // row of B staged by this lane
        // each thread stages two rows of each tile; zero-pad past the edges
        shTileA[threadIdx.y][threadIdx.x]      = (row < M        && aCol < N) ? A[row*N + aCol]        : 0.0f;
        shTileA[threadIdx.y+HALF][threadIdx.x] = (row+HALF < M   && aCol < N) ? A[(row+HALF)*N + aCol] : 0.0f;
        shTileB[threadIdx.y][threadIdx.x]      = (bRow < N       && col < K)  ? B[bRow*K + col]        : 0.0f;
        shTileB[threadIdx.y+HALF][threadIdx.x] = (bRow+HALF < N  && col < K)  ? B[(bRow+HALF)*K + col] : 0.0f;
        __syncthreads();                        // tiles fully staged
        for(int j = 0; j < BLOCK_SIZE; j++){
            val[0] += shTileA[threadIdx.y][j]      * shTileB[j][threadIdx.x];
            val[1] += shTileA[threadIdx.y+HALF][j] * shTileB[j][threadIdx.x];
        }
        __syncthreads();                        // done reading before next overwrite
    }
    if (col < K){                               // guard only the stores
        if (row < M)      C[row*K + col]        = val[0];
        if (row+HALF < M) C[(row+HALF)*K + col] = val[1];
    }
}
host调用方式:
dim3 block5(32, 32/2);
dim3 grid5((nk + block5.x - 1) / block5.x, (nm + block5.y * 2 - 1) / (block5.y * 2)); // 每个线程计算两行, grid.y 按 2*blockDim.y 划分
gemmOnGPU2DRegMem<<<grid5, block5>>>(d_MatA, d_MatB, d_MatC, nm, nn, nk);
// 上面的实现方式要求 M==N==K, BLOCK_SIZE = 32, blockDim.x = 32, blockDim.y = BLOCK_SIZE / 2
之前按照自己的想法写的那个版本结果总是会有问题,目前参考别人的想法修改的版本限制也比较多,这个方法还待完善
实现方法 \ M*N*K | 128*128*128 | 4096*4096*4096 |
---|---|---|
gemmOnGPU2D | 0.028809 ms | 95.804932 ms |
gemmOnGPU2DShareMem | 0.036865 ms | code: 1, reason: invalid argument |
gemmOnGPU2DShareMemV2 | 0.018066 ms | 93.896973 ms |
gemmOnGPU2DRegMem | 0.011963 ms | 33.145020 ms |
在矩阵4096*4096*4096查看参数情况:
实现方法 \ M*N*K | 线程束占用率 | 全局内存加载吞吐量 | 全局内存加载效率 |
---|---|---|---|
gemmOnGPU2D | 66.66% | 3.16 Tbyte/s | 82.5% |
gemmOnGPU2DShareMemV2 | 66.66% | 155.74 Gbyte/s | 100% |
gemmOnGPU2DRegMem | 99.57% | 437.30 Gbyte/s | 100% |
-
1 调试上面代码时候问题还是比较多的,特别是共享内存那块,自己调试时候问题比较多;
-
2 之前自己理解的“并行”最优就是去掉CPU实现中所有的循环,最近学习才发现内存读取才是耗时最多的。优化的过程主要是隐藏访存延迟和资源分配,感觉还有很多地方学习
-
3 对寄存器 Bank conflict 和 共享内存Bank conflict还是有比较多问题