diff --git a/Problem Sets/Problem Set 2/student_func.cu b/Problem Sets/Problem Set 2/student_func.cu
index 825e412b..c5988aa2 100755
--- a/Problem Sets/Problem Set 2/student_func.cu	
+++ b/Problem Sets/Problem Set 2/student_func.cu	
@@ -100,6 +100,7 @@
 
 //****************************************************************************
 
+#include "reference_calc.cpp"
 #include "utils.h"
 
 __global__
@@ -108,7 +109,7 @@ void gaussian_blur(const unsigned char* const inputChannel,
                    int numRows, int numCols,
                    const float* const filter, const int filterWidth)
 {
-  // TODO
+  // DONE
   
   // NOTE: Be sure to compute any intermediate results in floating point
   // before storing the final result as unsigned char.
@@ -129,6 +130,24 @@ void gaussian_blur(const unsigned char* const inputChannel,
   // the value is out of bounds), you should explicitly clamp the neighbor values you read
   // to be within the bounds of the image. If this is not clear to you, then please refer
   // to sequential reference solution for the exact clamping semantics you should follow.
+
+  const int2 thread_2D_pos = make_int2(blockIdx.x * blockDim.x + threadIdx.x, blockIdx.y * blockDim.y + threadIdx.y);
+  // Threads that land past the right or bottom image edge have no output pixel to compute.
+  if (thread_2D_pos.x >= numCols || thread_2D_pos.y >= numRows)
+    return;
+  const int half_width = filterWidth / 2;  // filter offsets run over [-half_width, +half_width] on each axis
+  float intermediate_result = 0.0f;
+  // Accumulate the weighted neighborhood in float; neighbor coordinates are clamped into the image bounds.
+  for (int r_filter = -half_width; r_filter <= half_width; ++r_filter) {
+    for (int c_filter = -half_width; c_filter <= half_width; ++c_filter) {
+      const int r_image = max(0, min(numRows - 1, thread_2D_pos.y + r_filter));
+      const int c_image = max(0, min(numCols - 1, thread_2D_pos.x + c_filter));
+      const float pixel = static_cast<float>(inputChannel[r_image * numCols + c_image]);
+      intermediate_result += filter[(r_filter + half_width) * filterWidth + (c_filter + half_width)] * pixel;
+    }
+  }
+  // Convert the float sum to the 8-bit channel type only at the final store.
+  outputChannel[thread_2D_pos.y * numCols + thread_2D_pos.x] = static_cast<unsigned char>(intermediate_result);
 }
 
 //This kernel takes in an image represented as a uchar4 and splits
@@ -141,7 +160,7 @@ void separateChannels(const uchar4* const inputImageRGBA,
                       unsigned char* const greenChannel,
                       unsigned char* const blueChannel)
 {
-  // TODO
+  // DONE
   //
   // NOTE: Be careful not to try to access memory that is outside the bounds of
   // the image. You'll want code that performs the following check before accessing
@@ -152,6 +171,19 @@ void separateChannels(const uchar4* const inputImageRGBA,
   // {
   //     return;
   // }
+
+  // Global pixel coordinate of this thread: x indexes columns, y indexes rows.
+  const int col = blockIdx.x * blockDim.x + threadIdx.x;
+  const int row = blockIdx.y * blockDim.y + threadIdx.y;
+  // Threads past the right or bottom image edge have no pixel to split.
+  if (col >= numCols || row >= numRows)
+    return;
+
+  const int offset = row * numCols + col;  // row-major index used for the input and all three outputs
+  redChannel[offset] = inputImageRGBA[offset].x;
+  greenChannel[offset] = inputImageRGBA[offset].y;
+  blueChannel[offset] = inputImageRGBA[offset].z;
+
 }
 
 //This kernel takes in three color channels and recombines them
@@ -198,18 +230,20 @@ void allocateMemoryAndCopyToGPU(const size_t numRowsImage, const size_t numColsI
   checkCudaErrors(cudaMalloc(&d_green, sizeof(unsigned char) * numRowsImage * numColsImage));
   checkCudaErrors(cudaMalloc(&d_blue,  sizeof(unsigned char) * numRowsImage * numColsImage));
 
-  //TODO:
+  //DONE:
   //Allocate memory for the filter on the GPU
   //Use the pointer d_filter that we have already declared for you
   //You need to allocate memory for the filter with cudaMalloc
   //be sure to use checkCudaErrors like the above examples to
   //be able to tell if anything goes wrong
   //IMPORTANT: Notice that we pass a pointer to a pointer to cudaMalloc
+  checkCudaErrors(cudaMalloc(&d_filter, sizeof(float) * filterWidth * filterWidth));  // filterWidth x filterWidth floats
 
-  //TODO:
+  //DONE:
   //Copy the filter on the host (h_filter) to the memory you just allocated
   //on the GPU.  cudaMemcpy(dst, src, numBytes, cudaMemcpyHostToDevice);
   //Remember to use checkCudaErrors!
+  checkCudaErrors(cudaMemcpy(d_filter, h_filter, sizeof(float) * filterWidth * filterWidth, cudaMemcpyHostToDevice));
 
 }
 
@@ -220,25 +254,32 @@ void your_gaussian_blur(const uchar4 * const h_inputImageRGBA, uchar4 * const d_
                         unsigned char *d_blueBlurred,
                         const int filterWidth)
 {
-  //TODO: Set reasonable block size (i.e., number of threads per block)
-  const dim3 blockSize;
-
-  //TODO:
+  //DONE: Set reasonable block size (i.e., number of threads per block)
+  const dim3 blockSize(32,32);
+  //DONE:
   //Compute correct grid size (i.e., number of blocks per kernel launch)
   //from the image size and and block size.
-  const dim3 gridSize;
-
-  //TODO: Launch a kernel for separating the RGBA image into different color channels
+  const dim3 gridSize((numCols + blockSize.x - 1) / blockSize.x, (numRows + blockSize.y - 1) / blockSize.y);  // ceil-div; x covers columns, y covers rows, matching the kernels' indexing
 
+  //DONE: Launch a kernel for separating the RGBA image into different color channels
+  separateChannels<<<gridSize, blockSize>>>(d_inputImageRGBA,(int) numRows,(int) numCols, d_red, d_green, d_blue);
+                      
+        
   // Call cudaDeviceSynchronize(), then call checkCudaErrors() immediately after
   // launching your kernel to make sure that you didn't make any mistakes.
   cudaDeviceSynchronize(); checkCudaErrors(cudaGetLastError());
 
-  //TODO: Call your convolution kernel here 3 times, once for each color channel.
-
+  //DONE: Call your convolution kernel here 3 times, once for each color channel.
+  gaussian_blur<<< gridSize, blockSize >>>(d_red, d_redBlurred, numRows, numCols, d_filter, filterWidth);
   // Again, call cudaDeviceSynchronize(), then call checkCudaErrors() immediately after
   // launching your kernel to make sure that you didn't make any mistakes.
   cudaDeviceSynchronize(); checkCudaErrors(cudaGetLastError());
+  gaussian_blur<<< gridSize, blockSize >>>(d_green, d_greenBlurred, numRows, numCols, d_filter, filterWidth);
+  cudaDeviceSynchronize(); checkCudaErrors(cudaGetLastError());
+  gaussian_blur<<< gridSize, blockSize >>>(d_blue, d_blueBlurred, numRows, numCols, d_filter, filterWidth);
+  cudaDeviceSynchronize(); checkCudaErrors(cudaGetLastError());
+  
+
 
   // Now we recombine your results. We take care of launching this kernel for you.
   //
@@ -251,14 +292,15 @@ void your_gaussian_blur(const uchar4 * const h_inputImageRGBA, uchar4 * const d_
                                              numRows,
                                              numCols);
   cudaDeviceSynchronize(); checkCudaErrors(cudaGetLastError());
-
 }
 
 
 //Free all the memory that we allocated
-//TODO: make sure you free any arrays that you allocated
+//DONE: make sure you free any arrays that you allocated
 void cleanup() {
   checkCudaErrors(cudaFree(d_red));
   checkCudaErrors(cudaFree(d_green));
   checkCudaErrors(cudaFree(d_blue));
+  checkCudaErrors(cudaFree(d_filter));  // filter buffer allocated in allocateMemoryAndCopyToGPU
+
 }