forked from uestc-lsu/CoSaMP
-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathcosampKernel.cl
347 lines (311 loc) · 8.8 KB
/
cosampKernel.cl
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
#pragma OPENCL EXTENSION cl_khr_fp64:enable
#define TS 32
#define TSDK 32
#define WPTM 2
#define WPTN 2
#define RTSM (TS/WPTM)
#define RTSN (TS/WPTN)
//matrix transpose
__kernel void transpose(const int m,
const int n,
__global const double* restrict input,
int ldb,
__global double* restrict output,
int ldbt)
{
int tx=get_local_id(0);
int ty=get_local_id(1);
int bx=get_group_id(0);
int by=get_group_id(1);
int BLOCK_SIZE=get_local_size(0);
__local double buffer[16][17];
int row=by*BLOCK_SIZE+ty;
int col=bx*BLOCK_SIZE+tx;
if(row<m && col<n)
buffer[ty][tx]=input[row*ldb+col];
barrier(CLK_LOCAL_MEM_FENCE);
int newRow=bx*BLOCK_SIZE+ty;
int newCol=by*BLOCK_SIZE+tx;
if(newRow<n && newCol<m)
output[newRow*ldbt+newCol]=buffer[tx][ty];
}
//vector norm
double impl_norm(
__global const double* restrict vec,
int start,
int size,
__local double* restrict tmp_buffer)
{
double tmp=0;
double vec_entry=0;
for(unsigned int i=get_local_id(0); i<size; i+=get_local_size(0))
{
vec_entry=vec[i+start];
tmp+=vec_entry*vec_entry;
}
tmp_buffer[get_local_id(0)]=tmp;
for(unsigned int stride=get_local_size(0)/2; stride>0; stride/=2)
{
barrier(CLK_LOCAL_MEM_FENCE);
if(get_local_id(0)<stride)
tmp_buffer[get_local_id(0)]+=tmp_buffer[get_local_id(0)+stride];
}
return tmp_buffer[0];
}
__kernel void norm(
__global const double* restrict vec,
int n,
__global double* restrict group_buffer,
__local double* restrict tmp_buffer)
{
double tmp;
tmp=impl_norm(vec, (get_group_id(0)*n)/get_num_groups(0), ((1+get_group_id(0))*n)/get_num_groups(0)-(get_group_id(0)*n)/get_num_groups(0), tmp_buffer);
if(get_local_id(0)==0)
group_buffer[get_group_id(0)]=tmp;
}
//matrix_vector multiplication
/*__kernel void mul_vec(
int m,
int n,
__global const double4* restrict A,
int lda,
__global const double4* restrict x,
__global double* restrict result,
__local double4* restrict work)
{
unsigned int row_gid=get_group_id(0);
unsigned int col_gid=get_local_id(0);
unsigned int lid=get_local_id(0);
for(unsigned int row=row_gid;row<m;row+=get_num_groups(0))
{
double4 dot_prod={0.0,0.0,0.0,0.0};
for(unsigned int col=col_gid; col<(n+3)/4; col+=get_local_size(0)){
double4 Aval=A[row*(lda/4)+col];
double4 xval=x[col];
dot_prod+=Aval*xval;
}
work[lid]=dot_prod;
barrier(CLK_LOCAL_MEM_FENCE);
for(unsigned int stride=get_local_size(0)/2; stride>0; stride>>=1)
{
if(lid<stride)
work[lid]+=work[lid+stride];
barrier(CLK_LOCAL_MEM_FENCE);
}
if(lid==0)
result[row]=work[0].s0+work[0].s1+work[0].s2+work[0].s3;
}
}*/
__kernel void mul_vec(int m,
int n,
__global const double* restrict A,
int lda,
__global const double* restrict x,
__global double* restrict result,
__local double* restrict work)
{
unsigned int row_gid=get_group_id(0);
unsigned int col_gid=get_local_id(0);
unsigned int lid=get_local_id(0);
for(unsigned int row=row_gid;row<m;row+=get_num_groups(0)) //coarse-grained parallelism
{
double dot_prod=0.0;
for(unsigned int col=col_gid; col<n; col+=get_local_size(0)){ //fine-grained parallelism
double Aval=A[row*lda+col];
double xval=x[col];
dot_prod+=Aval*xval;
}
work[lid]=dot_prod;
barrier(CLK_LOCAL_MEM_FENCE);
for(unsigned int stride=get_local_size(0)/2; stride>0; stride>>=1) //numerical merging method
{
if(lid<stride)
work[lid]+=work[lid+stride];
barrier(CLK_LOCAL_MEM_FENCE);
}
if(lid==0)
result[row]=work[0];
}
}
//Some of the matrix line copy to another matrix
__kernel void matRowCopyMat(
const int m,
const int n,
const int p,
__global const int* restrict index,
__global const double* restrict A,
int lda,
__global double* restrict B,
int ldb)
{
unsigned int tid=get_local_id(0);
unsigned int bid=get_group_id(0);
for(unsigned int i=tid;i<n;i+=get_local_size(0))
B[bid*ldb+i]=A[index[bid]*lda+i];
barrier(CLK_GLOBAL_MEM_FENCE);
}
/*__kernel void matRowCopyMat(
const int m,
const int n,
const int p,
__global const int* restrict index,
__global const double4* restrict A,
int lda,
__global double4* restrict B,
int ldb)
{
unsigned int tid=get_local_id(0);
unsigned int bid=get_group_id(0);
for(unsigned int i=tid;i<(n+3)/4;i+=get_local_size(0))
B[bid*(ldb/4)+i]=A[index[bid]*(lda/4)+i];
barrier(CLK_GLOBAL_MEM_FENCE);
}*/
//Matrix MatrixTran multiplication
/*__kernel void MatrixMultMatrixTran(
int m,
int n,
__global const double* restrict A,
int lda,
__global double* restrict C,
int ldc)
{
int bx=get_group_id(0);
int by=get_group_id(1);
int tx=get_local_id(0);
int ty=get_local_id(1);
int row=by*TS+ty;
int col=bx*TS+tx;
__local double As[TS][TSDK];
__local double Bs[TSDK][TS];
double acc[WPT];
for(int w=0; w<WPT; w++)
acc[w]=0.0;
for(int k=0; k<(n+TSDK-1)/TSDK; k++)
{
for(int l=0;l<WPT;l++){ //load one block of matrix A and B into local memory
if(k*TSDK+tx>=n || by*TS+ty+l*RTS>=m )
As[ty+l*RTS][tx]=0;
else
As[ty+l*RTS][tx]=A[(by*TS+ty+l*RTS)*lda+k*TSDK+tx];
if(k*TSDK+tx>=n || bx*TS+ty+l*RTS>=m)
Bs[tx][ty+l*RTS]=0;
else
Bs[tx][ty+l*RTS]=A[(bx*TS+ty+l*RTS)*lda+k*TSDK+tx];
barrier(CLK_LOCAL_MEM_FENCE);
}
for(int unroll=0; unroll<TSDK; unroll++){ //calculate each point value of matrix C
for(int w=0; w<WPT; w++)
acc[w]+=As[ty+w*RTS][unroll]*Bs[unroll][tx];
}
barrier(CLK_LOCAL_MEM_FENCE);
}
for(int w=0; w<WPT; w++) //load data in private memory into global memory
if(((row+w*RTS)<m) && (col<m))
C[(row+w*RTS)*ldc+col]=acc[w];
}*/
__kernel void MatrixMultMatrixTran(
int m,
int n,
__global const double* restrict A,
int lda,
__global double* restrict C,
int ldc)
{
int bx=get_group_id(0);
int by=get_group_id(1);
int tx=get_local_id(0);
int ty=get_local_id(1);
int row=by*TS+ty;
int col=bx*TS+tx;
__local double As[TS][TSDK];
__local double Bs[TSDK][TS];
double Areg;
double Breg[WPTN];
double acc[WPTM][WPTN];
for(int wm=0; wm<WPTM; wm++){
//#pragma unroll
for(int wn=0;wn<WPTN;wn++){
acc[wm][wn]=0.0;
}
}
for(int k=0; k<(n+TSDK-1)/TSDK; k++)
{
for(int l=0; l<WPTM; l++) //load one block of matrix A and B into local memory
for(int w=0; w<WPTN; w++){
if(k*TSDK+tx+w*RTSN>=n || by*TS+ty+l*RTSM>=m)
As[ty+l*RTSM][tx+w*RTSN]=0;
else
As[ty+l*RTSM][tx+w*RTSN]=A[(by*TS+ty+l*RTSM)*lda+k*TSDK+tx+w*RTSN];
if(k*TSDK+tx+w*RTSN>=n || bx*TS+ty+l*RTSM>=m)
Bs[tx+w*RTSN][ty+l*RTSM]=0;
else
Bs[tx+w*RTSN][ty+l*RTSM]=A[(bx*TS+ty+l*RTSM)*lda+k*TSDK+tx+w*RTSN];
}
barrier(CLK_LOCAL_MEM_FENCE);
for(int t=0;t<TSDK;t++){ //calculate each point value of matrix C
for(int wm=0;wm<WPTM;wm++){
int row1=ty+wm*RTSM;
Areg=As[row1][t];
for(int wn=0;wn<WPTN;wn++){
int col1=tx+wn*RTSN;
Breg[wn]=Bs[t][col1];
acc[wm][wn]+=Areg*Breg[wn];
}
}
}
barrier(CLK_LOCAL_MEM_FENCE);
}
for(int wm=0; wm<WPTM; wm++) //load data in private memory into global memory
for(int wn=0; wn<WPTN; wn++)
if( ((row+wm*RTSM)<m) && ((col+wn*RTSN)<m) )
C[(row+wm*RTSM)*ldc+col+wn*RTSN]=acc[wm][wn];
}
//solve trigonometric function
__kernel void triangular_substitute_inplace(
int m,
int n,
__global const double* restrict A,
int lda,
__global double* restrict x,
int options)
{
double temp;
unsigned int unit_diagonal_flag=(options & (1<<0));
unsigned int transposed_access_A = (options & (1<<1));
unsigned int is_lower_solve = (options & (1<<2));
unsigned int row;
for(unsigned int rows_processed=0; rows_processed<m; ++rows_processed)
{
row=is_lower_solve ? rows_processed: ((m-rows_processed)-1);
if(!unit_diagonal_flag)
{
barrier(CLK_GLOBAL_MEM_FENCE);
if(get_global_id(0)==0)
x[row]/=A[row*lda+row];
}
barrier(CLK_GLOBAL_MEM_FENCE);
temp=x[row];
for(int elim=(is_lower_solve ? (row+get_global_id(0)+1):get_global_id(0)); elim<(is_lower_solve ? m:row ); elim+=get_global_size(0))
x[elim]-=temp*A[transposed_access_A?(row*lda+elim):(elim*lda+row)];
barrier(CLK_GLOBAL_MEM_FENCE);
}
}
//vector subtract
/*__kernel void vectorSub_kernel(
__global const double4 *x,
__global const double4 *y,
const int n,
__global double4 *result)
{
for(unsigned int i=get_global_id(0); i<(n+3)/4; i+=get_global_size(0))
result[i]=x[i]-y[i];
}*/
__kernel void vectorSub_kernel(
__global const double* restrict x,
__global const double* restrict y,
const int n,
__global double* restrict result)
{
for(unsigned int i=get_global_id(0); i<n; i+=get_global_size(0))
result[i]=x[i]-y[i];
}