-
Notifications
You must be signed in to change notification settings - Fork 0
/
MatrixMultiply.cu
173 lines (142 loc) · 3.84 KB
/
MatrixMultiply.cu
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
/*
Program to multiply two matrices using the CPU and the GPU.
It benchmarks both to compare the CPU's sequential execution with the parallel computation
capability of a GPU.
*/
/*
Note that the ratio of the CPU and GPU execution times depends considerably on the
hardware that is used to run the program.
*/
// Importing the required headers
#include<stdio.h>
#include<stdlib.h>
#include<time.h>
#include<cuda.h>
// Computes the interval between two timestamps, in seconds.
//   start - earlier timestamp
//   end   - later timestamp
// The whole-second and nanosecond fields are differenced separately; the
// nanosecond difference may be negative and is simply added as a fraction.
double time_elapsed(struct timespec *start, struct timespec *end)
{
    double whole_seconds = (end->tv_sec - start->tv_sec);
    double fraction = (end->tv_nsec - start->tv_nsec) * 1e-9; // nanoseconds -> seconds
    return whole_seconds + fraction;
}
// GPU Kernel
// Computes c = a * b for n x n integer matrices stored as arrays of row pointers.
//   a, b - input matrices (a[i] / b[i] point to row i)
//   c    - output matrix, every element c[i][j] for 0 <= i, j < n is overwritten
//   n    - matrix dimension; nothing is done when n <= 0
// All pointers (the row tables and the rows themselves) are dereferenced in
// device code, so they must be accessible from the GPU.
// Launch layout: 1-D grid, 1-D block. Rows are distributed over blocks and
// columns over the threads of a block, both with strided loops, so the result
// is complete for any launch configuration. With a <<<n, n>>> launch each
// block handles exactly one row and each thread exactly one column.
__global__ void GPU_MUL(int **a, int **b, int **c, int n)
{
    if (n <= 0)
        return;
    const unsigned int dim = (unsigned int)n;

    // unsigned indices: adding gridDim.x / blockDim.x cannot wrap into a negative value
    for (unsigned int i = blockIdx.x; i < dim; i += gridDim.x)
    {
        const int *a_row = a[i];
        int *c_row = c[i];
        for (unsigned int j = threadIdx.x; j < dim; j += blockDim.x)
        {
            int ans = 0;
            for (unsigned int k = 0; k < dim; k++) // row i of a times column j of b
                ans += a_row[k] * b[k][j];
            c_row[j] = ans; // store result.
        }
    }
}
/*
// CPU Function
void CPU_MUL(int **a, int **b, int **c, int n)
{
int ans;
for(int i = 0; i < n; i++) //General way of multiplying matrices in c.
{
for(int j = 0; j < n; j++)
{
ans = 0;
for(int k = 0; k < n; k++)
{
ans += a[i][k] * b[k][j];
}
c[i][j] = ans;
}
}
return;
}
*/
// CPU Function
// Computes c = a * b for n x n integer matrices stored as arrays of row pointers.
//   a, b - input matrices (a[i] / b[i] point to row i)
//   c    - output matrix; every element is overwritten
//   n    - matrix dimension; nothing is done when n <= 0
// The rows of c are expected not to overlap the rows of a or b.
void CPU_MUL(int **a, int **b, int **c, int n)
{
    for(int i = 0; i < n; i++) // initialise the result to zero.
    {
        int *c_row = c[i];
        for(int j = 0; j < n; j++)
            c_row[j] = 0;
    }

    // i-k-j loop order: the innermost loop walks row k of b and row i of c
    // sequentially instead of striding down a column of b.
    for(int i = 0; i < n; i++)
    {
        const int *a_row = a[i];
        int *c_row = c[i];
        for(int k = 0; k < n; k++)
        {
            const int a_ik = a_row[k]; // invariant across the inner loop
            const int *b_row = b[k];
            for(int j = 0; j < n; j++)
                c_row[j] += a_ik * b_row[j];
        }
    }
}
// Prints a diagnostic and terminates the program when a CUDA runtime call fails.
#define CUDA_CHECK(call)                                                      \
    do {                                                                      \
        cudaError_t cuda_check_err_ = (call);                                 \
        if (cuda_check_err_ != cudaSuccess) {                                 \
            fprintf(stderr, "CUDA error at %s:%d: %s\n", __FILE__, __LINE__,  \
                    cudaGetErrorString(cuda_check_err_));                     \
            exit(EXIT_FAILURE);                                               \
        }                                                                     \
    } while (0)

// Allocates an n x n int matrix in managed memory: a table of n row pointers
// plus one contiguous buffer of n*n elements that the rows point into.
static int **alloc_matrix(int n)
{
    int **rows = NULL;
    int *data = NULL;
    CUDA_CHECK(cudaMallocManaged(&rows, (size_t)n * sizeof(int *)));
    CUDA_CHECK(cudaMallocManaged(&data, (size_t)n * (size_t)n * sizeof(int)));
    for(int i = 0; i < n; i++)
        rows[i] = data + (size_t)i * (size_t)n;
    return rows;
}

// Releases a matrix obtained from alloc_matrix.
static void free_matrix(int **rows)
{
    CUDA_CHECK(cudaFree(rows[0])); // rows[0] is the start of the contiguous element buffer
    CUDA_CHECK(cudaFree(rows));
}

// Prints an n x n matrix, one row per line, elements separated by spaces.
static void print_matrix(int **m, int n)
{
    for(int i = 0; i < n; i++)
    {
        for(int j = 0; j < n; j++)
            printf("%d ", m[i][j]);
        printf("\n");
    }
}

// Code execution begins here
// Reads n, multiplies the n x n identity matrix by an all-ones matrix on the
// GPU and then on the CPU, prints both results and the time each one took.
// Returns 0 on success and EXIT_FAILURE on invalid input; CUDA failures
// terminate the program through CUDA_CHECK.
int main()
{
    struct timespec start1, end1; //variables to store time for GPU
    struct timespec start2, end2; //variables to store time for CPU
    int n;

    printf("Enter the value of n: ");
    if(scanf("%d", &n) != 1 || n <= 0) //get value of n and reject unusable input.
    {
        fprintf(stderr, "n must be a positive integer\n");
        return EXIT_FAILURE;
    }

    // The kernel is launched with n threads per block, so n must not exceed
    // the device limit; otherwise the launch fails and no result is computed.
    int device = 0;
    int max_threads = 0;
    CUDA_CHECK(cudaGetDevice(&device));
    CUDA_CHECK(cudaDeviceGetAttribute(&max_threads, cudaDevAttrMaxThreadsPerBlock, device));
    if(n > max_threads)
    {
        fprintf(stderr, "n must not exceed %d (maximum threads per block on this device)\n", max_threads);
        return EXIT_FAILURE;
    }

    int **a = alloc_matrix(n);
    int **b = alloc_matrix(n);
    int **c = alloc_matrix(n);

    for(int i = 0; i < n; i++) // matrix A is the identity and matrix B has only 1's as its entries
        for(int j = 0; j < n; j++)
        {
            a[i][j] = (i == j) ? 1 : 0;
            b[i][j] = 1;
            c[i][j] = 0;
        }

    // CLOCK_MONOTONIC is used for the intervals because it is not affected by
    // adjustments of the wall-clock time.
    clock_gettime(CLOCK_MONOTONIC, &start1); //start timestamp
    GPU_MUL<<<n, n>>>(a, b, c, n);
    CUDA_CHECK(cudaGetLastError());      // reports an invalid launch configuration
    CUDA_CHECK(cudaDeviceSynchronize()); // waits for the kernel and reports execution errors
    clock_gettime(CLOCK_MONOTONIC, &end1); //end timestamp

    printf("\nResult of the GPU is :\n"); //print the result
    print_matrix(c, n);

    clock_gettime(CLOCK_MONOTONIC, &start2); //start timestamp
    CPU_MUL(a, b, c, n);
    clock_gettime(CLOCK_MONOTONIC, &end2); //end timestamp

    printf("\nResult of the CPU is :\n"); //print the result
    print_matrix(c, n);

    printf("\nTime taken by GPU is: %lf\n", time_elapsed(&start1, &end1)); //print time for GPU
    printf("Time taken by CPU is: %lf\n", time_elapsed(&start2, &end2)); //print time for CPU

    free_matrix(a); //free the allocated memory space
    free_matrix(b);
    free_matrix(c);
    CUDA_CHECK(cudaDeviceReset());
    return 0;
}