FB.cu
#include <chrono>
#include <thread>
#include <iostream>
#include <random>
#include <cmath>
#include <atomic>
#include <stdio.h>
#include "Timer.cuh"
#include "CheckError.cuh"
#include <omp.h>
using namespace timer;
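
// FB.cu: micro-benchmark comparing zero-copy (mapped pinned host memory) with
// CUDA Unified Memory. A synthetic kernel is launched repeatedly on an array of
// N integers while the host keeps refreshing the array contents; the average
// host-update time and the total application time are reported for the selected
// memory mode (see the ZERO switch below). Timer<HOST> and the SAFE_CALL /
// CHECK_CUDA_ERROR macros are provided by the project-local Timer.cuh and
// CheckError.cuh headers.
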
// Set PRINT to 1 for debug output
#define PRINT 0
#define FROM_debug 0
#define TO_debug 16
// Set ZERO to 1 to use zero-copy (mapped pinned host memory), or 0 to use Unified Memory
#define ZERO 1
// N is raised to POW in main (N = N^POW), giving the number of elements in the input array
unsigned int N = 2;
const int POW = 16;
const float MINUTES = 0.1; // Target benchmark length in minutes; the loop runs a fixed iteration count derived from this value, so the actual run time is only approximately this long
const int BLOCK_SIZE_X = 512;
const int BLOCK_SIZE_Y = 1;
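
// gpu_compute: synthetic per-element GPU workload. Each thread reads one array
// element and runs a short chain of expensive double-precision transcendental
// operations on it (atan, tgamma, sqrt, acosh).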
__global__
void gpu_compute(int* matrix, const int N) {
    int row = blockIdx.x * blockDim.x + threadIdx.x;
    if (row >= N) return;  // guard against extra threads when the grid is padded
    double fp0 = 2.0;
    double fp1 = 2.0;
    int mat = matrix[row];
    double res = 1.0;      // initialized: the loop below only ever updates it with /=
    for (int j = 0; j < 3; j++) {
        fp0 *= float(j) + atan(tgamma(sqrt(acosh(__ddiv_ru(3.14159265359 * mat, 0.7)))));
        fp1 += float(j) + tgamma(sqrt(acosh(__ddiv_ru(3.14159265359 * mat, 0.7))));
        fp0 *= float(j) * atan(tgamma(sqrt(acosh(__ddiv_ru(3.14159265359 * fp1, 0.7)))));
        fp1 *= float(j) / sqrt(tgamma(sqrt(acosh(__ddiv_ru(3.14159265359 * fp0, 0.7)))));
        res /= sqrt(fp0 + fp1);
    }
    // mat is never 1 or 17 (see fill_data), so this condition is always false; it is
    // here only to keep the compiler from optimizing the loop above away.
    if (17 % mat == 0) matrix[row] = res;
}
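
// fill_data: refills the array with uniform random integers in [1, 100],
// bumping the values 1 and 17 by one so that the kernel's conditional store
// (17 % mat == 0) can never be taken.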
void fill_data(int * d_matrix_host, int N) {
    unsigned seed = std::chrono::system_clock::now().time_since_epoch().count();
    std::default_random_engine generator(seed);
    std::uniform_int_distribution<int> distribution(1, 100);
    for (int i = 0; i < N; i++) {
        int temp = distribution(generator);
        if (temp == 17 || temp == 1) temp++;
        d_matrix_host[i] = temp;
    }
}
int main() {
    N = (unsigned int) pow(N, POW);    // N = 2^POW = 65536 elements
    int grid = N / BLOCK_SIZE_X;
    // -------------------------------------------------------------------------
    // DEVICE INIT
    dim3 DimGrid(grid, 1, 1);
    if (N % BLOCK_SIZE_X) DimGrid.x++; // add one block when N is not a multiple of the block size
    dim3 DimBlock(BLOCK_SIZE_X, BLOCK_SIZE_Y, 1);
    // -------------------------------------------------------------------------
    cudaSetDeviceFlags(cudaDeviceMapHost); // required for zero-copy (mapped) host allocations
    Timer<HOST> TM;        // initialization and per-iteration timing
    Timer<HOST> TM_update; // host-side data refresh timing
    Timer<HOST> TM_app;    // whole benchmark run timing
    // -------------------------------------------------------------------------
    // DEVICE MEMORY ALLOCATION
    int * d_matrix_host;
    int * d_matrix;
#if ZERO
    // Zero-copy: pinned host memory mapped into the device address space
    SAFE_CALL(cudaHostAlloc((void **)&d_matrix_host, N * sizeof(int), cudaHostAllocMapped));
    SAFE_CALL(cudaHostGetDevicePointer((void **)&d_matrix, (void *) d_matrix_host, 0));
#else
    // Unified Memory: the same pointer is valid on both host and device
    SAFE_CALL(cudaMallocManaged((void **)&d_matrix_host, N * sizeof(int)));
    d_matrix = d_matrix_host;
#endif
    // -------------------------------------------------------------------------
    // MATRIX INITIALIZATION
    std::cout << "Starting Initialization..." << std::endl;
    TM.start();
    fill_data(d_matrix_host, N);
    TM.stop();
    TM.print("Initialization Finished, time: ");
    // -------------------------------------------------------------------------
    // EXECUTION
    TM_app.start();
    std::cout << "Starting computation (GPU+CPU)..." << std::endl;
    // Fixed iteration count: one iteration per ~33.3 ms slot over MINUTES minutes
    for (int i = 0; i < int((MINUTES * 60 * 1000) / 33.3); i++) {
        TM.start();
        gpu_compute<<<DimGrid, DimBlock>>>(d_matrix, N);
#if !ZERO
        CHECK_CUDA_ERROR
#endif
        // Host-side refresh of the input data, timed separately
        TM_update.start();
        fill_data(d_matrix_host, N);
        TM_update.stop();
#if ZERO
        CHECK_CUDA_ERROR
#endif
        TM.stop();
    }
#if ZERO
    CHECK_CUDA_ERROR
#endif
    TM_app.stop();
    std::cout << "AVG UPDATE: " << TM_update.total_duration() / int((MINUTES * 60 * 1000) / 33.3) << std::endl;
    if (ZERO)
        TM_app.print("App run time ZC: ");
    else
        TM_app.print("App run time UM: ");
    std::cout << "AVG APP: " << TM_app.duration() / int((MINUTES * 60 * 1000) / 33.3) << std::endl;
    // -------------------------------------------------------------------------
    // DEVICE MEMORY DEALLOCATION
#if ZERO
    SAFE_CALL(cudaFreeHost(d_matrix_host)); // cudaFreeHost takes the host pointer returned by cudaHostAlloc
#else
    SAFE_CALL(cudaFree(d_matrix_host));
#endif
    // -------------------------------------------------------------------------
    cudaDeviceReset();
}