Skip to content
This repository has been archived by the owner on Jun 27, 2022. It is now read-only.

solution to problem-set-2 #33

Open
wants to merge 1 commit into
base: master
Choose a base branch
from
Open
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
72 changes: 57 additions & 15 deletions Problem Sets/Problem Set 2/student_func.cu
Original file line number Diff line number Diff line change
Expand Up @@ -100,6 +100,7 @@

//****************************************************************************

#include "reference_calc.cpp"
#include "utils.h"

__global__
Expand All @@ -108,7 +109,7 @@ void gaussian_blur(const unsigned char* const inputChannel,
int numRows, int numCols,
const float* const filter, const int filterWidth)
{
// TODO
// DONE

// NOTE: Be sure to compute any intermediate results in floating point
// before storing the final result as unsigned char.
Expand All @@ -129,6 +130,24 @@ void gaussian_blur(const unsigned char* const inputChannel,
// the value is out of bounds), you should explicitly clamp the neighbor values you read
// to be within the bounds of the image. If this is not clear to you, then please refer
// to sequential reference solution for the exact clamping semantics you should follow.

// One thread computes one output pixel: x selects the column, y selects the row.
// (An identifier may not begin with a digit, so the position is named thread_2D_pos.)
const int2 thread_2D_pos = make_int2(blockIdx.x * blockDim.x + threadIdx.x,
                                     blockIdx.y * blockDim.y + threadIdx.y);

// Threads whose coordinates fall outside the image must not touch memory.
if (thread_2D_pos.x >= numCols || thread_2D_pos.y >= numRows)
  return;

// Filter offsets run over [-halfWidth, halfWidth] around the current pixel.
// NOTE(review): this indexing assumes filterWidth is odd; an even width would
// step one element past each filter row - confirm against the caller.
const int halfWidth = filterWidth / 2;

// Accumulate in floating point; convert only when storing the final value.
float blurred = 0.0f;

for (int r_filter = -halfWidth; r_filter <= halfWidth; ++r_filter) {
  // Clamp the neighbour row into [0, numRows - 1] so border pixels reuse the edge row.
  const int r_image = max(0, min(numRows - 1, thread_2D_pos.y + r_filter));

  for (int c_filter = -halfWidth; c_filter <= halfWidth; ++c_filter) {
    // Same clamping for the neighbour column.
    const int c_image = max(0, min(numCols - 1, thread_2D_pos.x + c_filter));

    const float pixel  = static_cast<float>(inputChannel[r_image * numCols + c_image]);
    const float weight = filter[(r_filter + halfWidth) * filterWidth + (c_filter + halfWidth)];

    blurred += weight * pixel;
  }
}

// Row-major store of the blurred value, converted to the channel's element type.
outputChannel[thread_2D_pos.y * numCols + thread_2D_pos.x] = static_cast<unsigned char>(blurred);
}

//This kernel takes in an image represented as a uchar4 and splits
Expand All @@ -141,7 +160,7 @@ void separateChannels(const uchar4* const inputImageRGBA,
unsigned char* const greenChannel,
unsigned char* const blueChannel)
{
// TODO
// DONE
//
// NOTE: Be careful not to try to access memory that is outside the bounds of
// the image. You'll want code that performs the following check before accessing
Expand All @@ -152,6 +171,19 @@ void separateChannels(const uchar4* const inputImageRGBA,
// {
// return;
// }

// Global pixel coordinates handled by this thread: x is the column, y is the row.
const int col = blockIdx.x * blockDim.x + threadIdx.x;
const int row = blockIdx.y * blockDim.y + threadIdx.y;

// Only threads that land inside the image read or write anything.
if (col < numCols && row < numRows) {
  // Row-major offset shared by the input image and all three output channels.
  const int pixelIdx = row * numCols + col;
  const uchar4 rgba = inputImageRGBA[pixelIdx];

  // x, y, z of the uchar4 go to the red, green and blue planes; w is not copied.
  redChannel[pixelIdx] = rgba.x;
  greenChannel[pixelIdx] = rgba.y;
  blueChannel[pixelIdx] = rgba.z;
}

}

//This kernel takes in three color channels and recombines them
Expand Down Expand Up @@ -198,18 +230,20 @@ void allocateMemoryAndCopyToGPU(const size_t numRowsImage, const size_t numColsI
checkCudaErrors(cudaMalloc(&d_green, sizeof(unsigned char) * numRowsImage * numColsImage));
checkCudaErrors(cudaMalloc(&d_blue, sizeof(unsigned char) * numRowsImage * numColsImage));

//DONE:
//Allocate memory for the filter on the GPU
//Use the pointer d_filter that we have already declared for you
//You need to allocate memory for the filter with cudaMalloc
//be sure to use checkCudaErrors like the above examples to
//be able to tell if anything goes wrong
//IMPORTANT: Notice that we pass a pointer to a pointer to cudaMalloc,
//i.e. the address of d_filter (&d_filter), so cudaMalloc can store the device address in it.
//The filter holds filterWidth * filterWidth floats; the same byte count is used for the copy below.
const size_t filterBytes = sizeof(float) * filterWidth * filterWidth;
checkCudaErrors(cudaMalloc(&d_filter, filterBytes));

//DONE:
//Copy the filter on the host (h_filter) to the memory you just allocated
//on the GPU. cudaMemcpy(dst, src, numBytes, cudaMemcpyHostToDevice);
//Remember to use checkCudaErrors! (the same helper that wraps the cudaMalloc calls above)
checkCudaErrors(cudaMemcpy(d_filter, h_filter, filterBytes, cudaMemcpyHostToDevice));

}

Expand All @@ -220,25 +254,32 @@ void your_gaussian_blur(const uchar4 * const h_inputImageRGBA, uchar4 * const d_
unsigned char *d_blueBlurred,
const int filterWidth)
{
//DONE: Set reasonable block size (i.e., number of threads per block)
//A 32 x 32 block gives 1024 threads per block.
const dim3 blockSize(32, 32);

//DONE:
//Compute correct grid size (i.e., number of blocks per kernel launch)
//from the image size and block size.
//The kernels compare the x coordinate against numCols and the y coordinate
//against numRows, so grid.x has to cover the columns and grid.y the rows.
//Round up so a partially filled block still covers the last columns/rows.
const dim3 gridSize((static_cast<unsigned int>(numCols) + blockSize.x - 1) / blockSize.x,
                    (static_cast<unsigned int>(numRows) + blockSize.y - 1) / blockSize.y);

//DONE: Launch a kernel for separating the RGBA image into different color channels
separateChannels<<<gridSize, blockSize>>>(d_inputImageRGBA, (int) numRows, (int) numCols, d_red, d_green, d_blue);

// Call cudaDeviceSynchronize(), then call checkCudaErrors() immediately after
// launching your kernel to make sure that you didn't make any mistakes.
cudaDeviceSynchronize(); checkCudaErrors(cudaGetLastError());

//DONE: Call your convolution kernel here 3 times, once for each color channel.
// Again, call cudaDeviceSynchronize(), then call checkCudaErrors() immediately after
// launching your kernel to make sure that you didn't make any mistakes.
gaussian_blur<<<gridSize, blockSize>>>(d_red, d_redBlurred, (int) numRows, (int) numCols, d_filter, filterWidth);
cudaDeviceSynchronize(); checkCudaErrors(cudaGetLastError());

gaussian_blur<<<gridSize, blockSize>>>(d_green, d_greenBlurred, (int) numRows, (int) numCols, d_filter, filterWidth);
cudaDeviceSynchronize(); checkCudaErrors(cudaGetLastError());

gaussian_blur<<<gridSize, blockSize>>>(d_blue, d_blueBlurred, (int) numRows, (int) numCols, d_filter, filterWidth);
cudaDeviceSynchronize(); checkCudaErrors(cudaGetLastError());



// Now we recombine your results. We take care of launching this kernel for you.
//
Expand All @@ -251,14 +292,15 @@ void your_gaussian_blur(const uchar4 * const h_inputImageRGBA, uchar4 * const d_
numRows,
numCols);
cudaDeviceSynchronize(); checkCudaErrors(cudaGetLastError());

}


//Free all the memory that we allocated
//TODO: make sure you free any arrays that you allocated
//DONE: make sure you free any arrays that you allocated
// Releases the device buffers this file keeps in d_red, d_green, d_blue and
// d_filter by calling cudaFree on each one. Every cudaFree return value is
// passed to checkCudaErrors so a failed release is reported rather than ignored.
// NOTE(review): the pointers are not reset afterwards, so this presumably is
// meant to run once after all kernels have finished - confirm with the caller.
void cleanup() {
// Per-channel scratch buffers (the matching cudaMalloc calls live in allocateMemoryAndCopyToGPU).
checkCudaErrors(cudaFree(d_red));
checkCudaErrors(cudaFree(d_green));
checkCudaErrors(cudaFree(d_blue));
// Device copy of the blur filter weights.
checkCudaErrors(cudaFree(d_filter));

}