-
Notifications
You must be signed in to change notification settings - Fork 0
/
AddVectors.cu
92 lines (75 loc) · 2.16 KB
/
AddVectors.cu
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
/*
Program to add two long integer arrays.
Hopefully this is a better exploitation of the GPU's parallel computing capabilities.
Benchmarking timing to compare execution speeds.
*/
/*
Note that the ratio of the CPU and GPU execution times depends considerably on the
hardware that is used to run the program.
*/
// Importing the required headers
#include<stdio.h>
#include<cuda.h>
#include<time.h>
// Returns the duration from start to end times in sec
double time_elapsed(struct timespec *start, struct timespec *end)
{
    /*
     * Interval between two timestamps, in seconds.
     *   start : earlier timestamp
     *   end   : later timestamp
     * The whole-second and nanosecond fields are differenced separately and
     * then summed, so a negative nanosecond difference is absorbed by the
     * whole-second part.
     */
    double whole_seconds = (end->tv_sec - start->tv_sec);
    double fraction = (end->tv_nsec - start->tv_nsec) * 1e-9; // ns -> s
    return whole_seconds + fraction;
}
// GPU Kernel
/*
 * Element-wise in-place addition on the device: b[i] += a[i] for 0 <= i < n.
 *
 *   a : input array with at least n elements, readable from the device
 *   b : input/output array with at least n elements, accumulated in place
 *   n : number of elements to process (n <= 0 does nothing)
 *
 * Launch layout: 1-D grid of 1-D blocks. Each thread starts at its global
 * index and advances by the total number of launched threads, so any grid
 * size produces the correct result, including grids with fewer or more
 * threads than n. No shared memory is used.
 */
__global__ void GPU_ADD(int *a, int *b, int n)
{
    // 64-bit arithmetic keeps blockIdx.x * blockDim.x from overflowing int.
    const long long stride = (long long)gridDim.x * blockDim.x;
    long long i = (long long)blockIdx.x * blockDim.x + threadIdx.x;

    // Strict '<' bound: index n itself is one past the end and must not be touched.
    for (; i < n; i += stride)
        b[i] += a[i];
}
/*
// GPU Kernel Variant: strides by blockDim.x only, so launch it with a single block
__global__ void GPU_ADD(int *a, int *b, int n)
{
int index = threadIdx.x;
int stride = blockDim.x;
for(int i = index; i < n; i += stride)
b[i] += a[i];
}
*/
// CPU function
void CPU_ADD(int *a, int *b, int n)
{
    /*
     * Sequential host reference: accumulates a into b element by element.
     *   a : source array, read only by this routine
     *   b : destination array, updated in place
     *   n : number of leading elements to process
     */
    int pos = 0;
    while (pos < n)
    {
        b[pos] = b[pos] + a[pos];
        ++pos;
    }
}
// Code execution begins here
/*
 * Reports a failed CUDA runtime call on stderr.
 *   status : value returned by the CUDA call
 *   what   : short description of the call, used in the message
 * Returns 1 when status is an error and 0 when it is cudaSuccess, so calls
 * can be chained with || and stop at the first failure.
 */
static int cuda_failed(cudaError_t status, const char *what)
{
    if (status == cudaSuccess)
        return 0;
    fprintf(stderr, "CUDA error in %s: %s\n", what, cudaGetErrorString(status));
    return 1;
}

/*
 * Benchmark driver: adds two arrays of 2^20 ints once on the GPU and once on
 * the CPU, checks that both produce the same result, and prints the
 * wall-clock duration of each run.
 *
 * Returns 0 on success, 1 if any CUDA call fails or the results differ.
 */
int main()
{
    struct timespec start1, end1; // GPU interval
    struct timespec start2, end2; // CPU interval
    const int n = 1 << 20;
    const size_t bytes = (size_t)n * sizeof(int);

    // Derive the grid from n (ceil-div) instead of relying on a hard-coded
    // block count that only happens to match the problem size.
    const int threads = 1024;
    const int blocks = (n + threads - 1) / threads;

    int *a1 = NULL, *b1 = NULL; // GPU operands
    int *a2 = NULL, *b2 = NULL; // CPU operands

    int failed = cuda_failed(cudaMallocManaged(&a1, bytes), "cudaMallocManaged(a1)")
              || cuda_failed(cudaMallocManaged(&a2, bytes), "cudaMallocManaged(a2)")
              || cuda_failed(cudaMallocManaged(&b1, bytes), "cudaMallocManaged(b1)")
              || cuda_failed(cudaMallocManaged(&b2, bytes), "cudaMallocManaged(b2)");

    if (!failed)
    {
        for (int i = 0; i < n; i++)
        {
            a1[i] = a2[i] = 1;
            b1[i] = b2[i] = 2;
        }

        // CLOCK_MONOTONIC is not affected by wall-clock adjustments, which
        // makes it the appropriate clock for measuring intervals.
        // NOTE(review): the GPU interval may also include on-demand migration
        // of the managed buffers to the device; confirm whether that is the
        // intended thing to measure.
        clock_gettime(CLOCK_MONOTONIC, &start1); // start timestamp
        GPU_ADD<<<blocks, threads>>>(a1, b1, n);
        // Launch-configuration errors are only visible through cudaGetLastError().
        failed = cuda_failed(cudaGetLastError(), "GPU_ADD launch");
        // Errors raised while the kernel runs surface at the synchronizing call.
        if (!failed)
            failed = cuda_failed(cudaDeviceSynchronize(), "GPU_ADD execution");
        clock_gettime(CLOCK_MONOTONIC, &end1); // end timestamp
    }

    if (!failed)
    {
        clock_gettime(CLOCK_MONOTONIC, &start2); // start timestamp
        CPU_ADD(a2, b2, n);
        clock_gettime(CLOCK_MONOTONIC, &end2); // end timestamp

        // A timing comparison is only meaningful if both paths computed the same thing.
        int mismatches = 0;
        for (int i = 0; i < n; i++)
        {
            if (b1[i] != b2[i])
                mismatches++;
        }
        if (mismatches != 0)
        {
            fprintf(stderr, "GPU and CPU results differ in %d element(s)\n", mismatches);
            failed = 1;
        }
    }

    // cudaFree accepts a null pointer, so this is safe after a partial allocation.
    cudaFree(a1);
    cudaFree(a2);
    cudaFree(b1);
    cudaFree(b2);

    if (failed)
        return 1;

    printf("Time taken by GPU is: %lf\n", time_elapsed(&start1, &end1)); // print result for GPU
    printf("Time taken by CPU is: %lf\n", time_elapsed(&start2, &end2)); // print result for CPU
    return 0;
}