From e3eb2df311806625e3972b4ae456a9fc0abdba0f Mon Sep 17 00:00:00 2001
From: venkat
Date: Mon, 13 Jul 2020 17:50:14 +0530
Subject: [PATCH] solution to problem-set-2

Implement the gaussian_blur and separateChannels kernels, allocate the
filter on the GPU and copy the host filter into it, launch the kernels
from your_gaussian_blur, and free d_filter in cleanup.

Launch layout: 32x32 thread blocks. Grid x covers the image columns and
grid y covers the image rows, matching the bounds checks in the kernels
(x is tested against numCols, y against numRows).

gaussian_blur clamps every neighbor coordinate to the image bounds and
accumulates in float before storing the result as unsigned char.
---
 Problem Sets/Problem Set 2/student_func.cu | 72 +++++++++++++++++-----
 1 file changed, 57 insertions(+), 15 deletions(-)

diff --git a/Problem Sets/Problem Set 2/student_func.cu b/Problem Sets/Problem Set 2/student_func.cu
index 825e412b..c5988aa2 100755
--- a/Problem Sets/Problem Set 2/student_func.cu
+++ b/Problem Sets/Problem Set 2/student_func.cu
@@ -100,6 +100,7 @@
 
 //****************************************************************************
 
+#include "reference_calc.cpp"
 #include "utils.h"
 
 __global__
@@ -108,7 +109,7 @@ void gaussian_blur(const unsigned char* const inputChannel,
                    int numRows, int numCols,
                    const float* const filter, const int filterWidth)
 {
-  // TODO
+  // DONE
 
   // NOTE: Be sure to compute any intermediate results in floating point
   // before storing the final result as unsigned char.
@@ -129,6 +130,24 @@ void gaussian_blur(const unsigned char* const inputChannel,
   // the value is out of bounds), you should explicitly clamp the neighbor values you read
   // to be within the bounds of the image. If this is not clear to you, then please refer
   // to sequential reference solution for the exact clamping semantics you should follow.
+
+  const int2 thread_2D_pos = make_int2(threadIdx.x + blockIdx.x*blockDim.x, threadIdx.y + blockIdx.y*blockDim.y); // x = column, y = row
+
+  if(thread_2D_pos.x >= numCols || thread_2D_pos.y >= numRows)
+    return; // thread lies outside the image
+
+  float intermediate_result = 0.0f; // accumulate in float, convert once at the end
+
+  for(int r_filter = -filterWidth/2 ; r_filter <= filterWidth/2 ; ++r_filter){
+    for(int c_filter = -filterWidth/2 ; c_filter <= filterWidth/2 ; ++c_filter){
+      int r_image = max(0, min( static_cast<int>(numRows - 1) , r_filter + thread_2D_pos.y)); // clamp row to [0, numRows-1]
+      int c_image = max(0, min( static_cast<int>(numCols - 1) , c_filter + thread_2D_pos.x)); // clamp column to [0, numCols-1]
+      float temp = static_cast<float>(inputChannel[r_image*numCols + c_image]);
+      intermediate_result += filter[(r_filter+filterWidth/2)*filterWidth + c_filter + filterWidth/2] * temp;
+    }
+  }
+
+  outputChannel[thread_2D_pos.x + numCols*thread_2D_pos.y] = static_cast<unsigned char>(intermediate_result);
 }
 
 //This kernel takes in an image represented as a uchar4 and splits
@@ -141,7 +160,7 @@ void separateChannels(const uchar4* const inputImageRGBA,
                       unsigned char* const greenChannel,
                       unsigned char* const blueChannel)
 {
-  // TODO
+  // DONE
   //
   // NOTE: Be careful not to try to access memory that is outside the bounds of
   // the image. You'll want code that performs the following check before accessing
@@ -152,6 +171,19 @@ void separateChannels(const uchar4* const inputImageRGBA,
   // {
   //     return;
   // }
+
+  const int2 thread_2D_pos = make_int2( blockIdx.x * blockDim.x + threadIdx.x,
+                                        blockIdx.y * blockDim.y + threadIdx.y);
+
+  const int thread_1D_pos = thread_2D_pos.y * numCols + thread_2D_pos.x; // row-major flat index
+
+  if(thread_2D_pos.x >= numCols || thread_2D_pos.y >= numRows)
+    return; // no memory is touched before this check
+
+  redChannel[thread_1D_pos] = inputImageRGBA[thread_1D_pos].x;
+  greenChannel[thread_1D_pos] = inputImageRGBA[thread_1D_pos].y;
+  blueChannel[thread_1D_pos] = inputImageRGBA[thread_1D_pos].z;
+
 }
 
 //This kernel takes in three color channels and recombines them
@@ -198,18 +230,20 @@ void allocateMemoryAndCopyToGPU(const size_t numRowsImage, const size_t numColsI
   checkCudaErrors(cudaMalloc(&d_green, sizeof(unsigned char) * numRowsImage * numColsImage));
   checkCudaErrors(cudaMalloc(&d_blue, sizeof(unsigned char) * numRowsImage * numColsImage));
 
-  //TODO:
+  //DONE:
   //Allocate memory for the filter on the GPU
   //Use the pointer d_filter that we have already declared for you
   //You need to allocate memory for the filter with cudaMalloc
   //be sure to use checkCudaErrors like the above examples to
   //be able to tell if anything goes wrong
   //IMPORTANT: Notice that we pass a pointer to a pointer to cudaMalloc
+  checkCudaErrors(cudaMalloc(&d_filter, sizeof(float) * filterWidth * filterWidth));
 
-  //TODO:
+  //DONE:
   //Copy the filter on the host (h_filter) to the memory you just allocated
   //on the GPU.  cudaMemcpy(dst, src, numBytes, cudaMemcpyHostToDevice);
   //Remember to use checkCudaErrors!
+  checkCudaErrors(cudaMemcpy(d_filter, h_filter, sizeof(float) * filterWidth * filterWidth, cudaMemcpyHostToDevice));
 
 }
 
@@ -220,25 +254,32 @@ void your_gaussian_blur(const uchar4 * const h_inputImageRGBA, uchar4 * const d_
                         unsigned char *d_blueBlurred,
                         const int filterWidth)
 {
-  //TODO: Set reasonable block size (i.e., number of threads per block)
-  const dim3 blockSize;
-
-  //TODO:
+  //DONE: Set reasonable block size (i.e., number of threads per block)
+  const dim3 blockSize(32,32);
+  //DONE:
   //Compute correct grid size (i.e., number of blocks per kernel launch)
   //from the image size and and block size.
-  const dim3 gridSize;
-
-  //TODO: Launch a kernel for separating the RGBA image into different color channels
+  const dim3 gridSize((numCols + blockSize.x - 1)/blockSize.x, (numRows + blockSize.y - 1)/blockSize.y); // ceil-div: x spans columns, y spans rows
+  //DONE: Launch a kernel for separating the RGBA image into different color channels
+  separateChannels<<< gridSize, blockSize >>>(d_inputImageRGBA,(int) numRows,(int) numCols, d_red, d_green, d_blue);
+
+
 
   // Call cudaDeviceSynchronize(), then call checkCudaErrors() immediately after
   // launching your kernel to make sure that you didn't make any mistakes.
   cudaDeviceSynchronize(); checkCudaErrors(cudaGetLastError());
 
-  //TODO: Call your convolution kernel here 3 times, once for each color channel.
-
+  //DONE: Call your convolution kernel here 3 times, once for each color channel.
+  gaussian_blur<<< gridSize, blockSize >>>(d_red, d_redBlurred, numRows, numCols, d_filter, filterWidth);
   // Again, call cudaDeviceSynchronize(), then call checkCudaErrors() immediately after
   // launching your kernel to make sure that you didn't make any mistakes.
   cudaDeviceSynchronize(); checkCudaErrors(cudaGetLastError());
+  gaussian_blur<<< gridSize, blockSize >>>(d_green, d_greenBlurred, numRows, numCols, d_filter, filterWidth);
+  cudaDeviceSynchronize(); checkCudaErrors(cudaGetLastError());
+  gaussian_blur<<< gridSize, blockSize >>>(d_blue, d_blueBlurred, numRows, numCols, d_filter, filterWidth);
+  cudaDeviceSynchronize(); checkCudaErrors(cudaGetLastError());
+
+
 
   // Now we recombine your results. We take care of launching this kernel for you.
   //
@@ -251,14 +292,15 @@ void your_gaussian_blur(const uchar4 * const h_inputImageRGBA, uchar4 * const d_
                                              numRows,
                                              numCols);
   cudaDeviceSynchronize(); checkCudaErrors(cudaGetLastError());
-
 }
 
 
 //Free all the memory that we allocated
-//TODO: make sure you free any arrays that you allocated
+//DONE: make sure you free any arrays that you allocated
 void cleanup() {
   checkCudaErrors(cudaFree(d_red));
   checkCudaErrors(cudaFree(d_green));
   checkCudaErrors(cudaFree(d_blue));
+  checkCudaErrors(cudaFree(d_filter));
+
 }