// RSA_CUDA.cpp
/* Copyright (C) 2016 Levi Barnes - All Rights Reserved
* You may use, distribute and modify this code under the
* terms of the Apache license, version 2.0, 2004.
*
* You should have received a copy of the Apache 2.0 license with
* this file. If not, please write to: [email protected].
*/
#include <stdlib.h> //for rand
#include <math.h> //for sqrt
#include <limits.h> //for INT_MAX
#include <iostream> //std::cout and std::cin
#include <fstream> //for file input
#include <string.h> //for strlen
#include <time.h> //for clock
//Size, in elements, of the char message buffers declared in main; the
// long int work buffers there are twice this size
#define MAX_STR_LEN 10000
//Number of threads per block used when the CUDA kernel is launched
#define BLOCKWID 128
//Primality test; returns zero for prime numbers
long int prime(long int);
//Greatest common divisor by Euclid's method
long int gcd(long int p, long int q);
//Writes the public exponent and modulus derived from p and q to *exp and *mod
int publickey(long int p, long int q, long int* exp, long int* mod);
//Writes the private exponent matching pubexp, and the modulus, to *exp and *mod
int privatekey(long int p, long int q, long int pubexp, long int* exp, long int* mod);
//Element-wise modular exponentiation of inmsg into outmsg; the two unnamed
// arguments are the exponent and the modulus, in that order
int encrypt(long int* inmsg, long int, long int, long int* outmsg, size_t len);
int decrypt(long int* inmsg, long int, long int, long int* outmsg, size_t len);
//Widens a zero-terminated string to long ints; with random_salt each
// character is stored as a pair (r, character + r)
int char2long(char* in, long int* out, bool random_salt=false);
//Narrows long ints back to chars; with subtract_pairs the input is read as
// pairs (r, v) and r is subtracted from v
int long2char(long int* in, char* out, bool subtract_pairs=false);
//Modular exponentiation by repeated squaring
long int fastexp(long int base, long int exp, long int mod);
//These additional arguments to main allow the user to supply a file name
// from the command line
//
//Reads two primes and a one-line message, builds an RSA key pair, encrypts
// and decrypts the message and prints both results together with the
// elapsed clock() ticks of each phase.
//Returns 0 on success and 1 when the input cannot be opened, a number
// cannot be read, a number is not prime, or encryption/decryption reports
// a failure.
int main(int argc, char** argv) {
    long int p, q, pube, pubmod, prive, privmod;
    //Static storage keeps roughly half a megabyte of buffers off the stack
    // and guarantees they start zero-filled. long2char stops at two
    // consecutive zeros while encrypt/decrypt write only a single one, so
    // the zero fill supplies the second terminator.
    static char inmsg[MAX_STR_LEN];
    static long int inmsg_l[MAX_STR_LEN*2];
    static char outmsg[MAX_STR_LEN];
    static long int outmsg_l[MAX_STR_LEN*2];
    static char decrmsg[MAX_STR_LEN];
    static long int decrmsg_l[MAX_STR_LEN*2];
    size_t len;
    clock_t encrypt_time, decrypt_time, keygen_time;

    //myin will take input from a file if specified on the command line and
    // from keyboard input (or piped input) if no file is specified.
    //argc counts the program name, so a file name is present only when
    // argc > 1. The stream object lives in this scope, so nothing leaks.
    std::ifstream infile;
    std::istream* myin = &std::cin;
    if (argc > 1)
    {
        infile.open(argv[1]);
        if (!infile.is_open())
        {
            std::cerr << "Could not open " << argv[1] << std::endl;
            return 1;
        }
        myin = &infile;
    }

#ifdef __CUDA
    //Wake up the GPU
    long int *dp;
    cudaMalloc(&dp, sizeof(long int));
    cudaFree(dp);
#endif

    //Get inputs
    // - two prime numbers
    // - a message to be encrypted
    std::cout << "ENTER A PRIME NUMBER" << std::endl;
    if (!(*myin >> p))
    {
        std::cerr << "Could not read a number." << std::endl;
        return 1;
    }
    //Values below 2 are rejected here as well, independent of prime()
    if (p < 2 || prime(p))
    {
        std::cerr << p << " is not prime." << std::endl;
        return 1;
    }
    std::cout << "ENTER ANOTHER PRIME NUMBER" << std::endl;
    if (!(*myin >> q))
    {
        std::cerr << "Could not read a number." << std::endl;
        return 1;
    }
    if (q < 2 || prime(q))
    {
        std::cerr << q << " is not prime." << std::endl;
        return 1;
    }
    std::cout << "ENTER A MESSAGE (up to " << MAX_STR_LEN << " characters)." << std::endl;
    //Discard the rest of the line that held the second prime
    myin->ignore(INT_MAX,'\n');
    myin->getline(inmsg, MAX_STR_LEN);
    std::cout << inmsg << std::endl;
    len = strlen(inmsg);

    //Generate public and private keys from p and q
    clock_t start = clock();
    publickey(p,q,&pube,&pubmod);
    std::cout << "public key: " << pube << ", " << pubmod << std::endl;
    privatekey(p,q,pube,&prive,&privmod);
    std::cout << "private key: " << prive << ", " << privmod << std::endl;
    keygen_time = clock() - start;

    //Encrypt, then decrypt the message
    std::cout << "Original text: " << inmsg << std::endl;
    //Convert to long ints; salting stores two long ints per character,
    // which is why the element count passed below is len*2
    char2long(inmsg, inmsg_l, true);

    //Encrypt
    start = clock();
    if (0 != encrypt(inmsg_l, pube, pubmod, outmsg_l, len*2))
    {
        std::cerr << "Encryption failed." << std::endl;
        return 1;
    }
    encrypt_time = clock() - start;
    long2char(outmsg_l, outmsg, true);
    std::cout << "Encrypted text: " << outmsg << std::endl;

    //Decrypt
    start = clock();
    if (0 != decrypt(outmsg_l, prive, privmod, decrmsg_l, len*2))
    {
        std::cerr << "Decryption failed." << std::endl;
        return 1;
    }
    decrypt_time = clock() - start;
    long2char(decrmsg_l, decrmsg, true);
    std::cout << "Decrypted text: " << decrmsg << std::endl;

    std::cout << "Key generation time: " << keygen_time << std::endl;
    std::cout << "Encrypt time: " << encrypt_time << std::endl;
    std::cout << "Decrypt time: " << decrypt_time << std::endl;
    return 0;
}
long int prime(long int p)
//Primality test by trial division.
//Returns zero for prime numbers. For anything else the result is non-zero:
// the smallest divisor >= 2 of p, or 1 when p is smaller than 2 (such
// values have no prime factorisation and are not prime).
{
    if (p < 2) return 1;
    //z <= p/z is z*z <= p without the risk of overflowing z*z, and it
    // includes the integer square root itself so that perfect squares
    // (4, 9, 25, ...) and numbers such as 15 are caught.
    for (long int z = 2; z <= p/z; z++)
    {
        if (0 == p%z) return z;
    }
    return 0;
}
int publickey(long int p, long int q, long int *exp, long int *mod)
//Generates a public key pair
//The exponent is chosen relative to the totient (p-1)*(q-1): the search
// starts near its square root and moves upward to the first integer that
// is coprime with it.
//The modulus written to *mod is p*q.
//Always returns 0.
{
    const long int totient = (p-1)*(q-1);
    //Choose an integer near sqrt(totient). The conversion goes to long int
    // rather than int so a large totient is not truncated.
    long int candidate = (long int)sqrt((double)totient);
    //Find a coprime near that number
    while (1 != gcd(candidate, totient))
    {
        candidate++;
    }
    *exp = candidate;
    *mod = p*q;
    return 0;
}
int privatekey(long int p, long int q, long int pubexp, long int *exp, long int *mod)
//Generates a private key pair
//The exponent is the number, n, in [1, (p-1)*(q-1)) which satisfies
// (n * pubexp) % ((p-1)*(q-1)) = 1
//The modulus written to *mod is p*q.
//Returns 0 on success. Returns 1, with *exp set to 0, when no such n
// exists: pubexp shares a factor with (p-1)*(q-1), or (p-1)*(q-1) <= 1.
//The inverse is found with the extended Euclidean algorithm, which takes a
// number of steps logarithmic in the totient instead of stepping through
// every multiple of pubexp.
{
    const long int totient = (p-1)*(q-1);
    *mod = p*q;
    *exp = 0;
    if (totient <= 1) return 1;

    //Invariant: coef * pubexp is congruent to rem (mod totient) for both
    // the (rem_prev, coef_prev) and (rem_cur, coef_cur) pairs.
    long int rem_prev = totient;
    long int rem_cur = pubexp % totient;
    if (rem_cur < 0) rem_cur += totient;
    long int coef_prev = 0;
    long int coef_cur = 1;
    while (rem_cur != 0)
    {
        const long int quot = rem_prev / rem_cur;
        const long int rem_next = rem_prev - quot*rem_cur;
        const long int coef_next = coef_prev - quot*coef_cur;
        rem_prev = rem_cur;
        rem_cur = rem_next;
        coef_prev = coef_cur;
        coef_cur = coef_next;
    }
    //rem_prev now holds gcd(pubexp, totient); an inverse exists only if it is 1
    if (rem_prev != 1) return 1;
    //Bring the coefficient into the positive range
    if (coef_prev < 0) coef_prev += totient;
    *exp = coef_prev;
    return 0;
}
#ifndef __CUDA
int encrypt(long int* in, long int exp, long int mod, long int* out, size_t len)
//Encrypts len long ints from in into out.
//exp and mod are expected to be the public key pair.
//Every element c becomes
// c' = (c^exp)%mod
//One extra element, out[len], is written as a zero terminator.
{
#pragma acc parallel loop
#pragma omp parallel for
    for (int idx = 0; idx < len; ++idx)
    {
        //Repeated-squaring exponentiation. The slow alternative is to
        // multiply by in[idx] exp-1 times, reducing modulo mod after each
        // step, which yields the same residue.
        out[idx] = fastexp(in[idx], exp, mod);
    }
    out[len] = 0; //Terminate with a zero
    return 0;
}
int decrypt(long int* in, long int exp, long int mod, long int* out, size_t len)
//Decrypts len long ints from in into out.
//exp and mod are expected to be the private key pair.
//Every element c' becomes
// c = (c'^exp)%mod
//One extra element, out[len], is written as a zero terminator.
{
#pragma acc parallel loop
#pragma omp parallel for
    for (int idx = 0; idx < len; ++idx)
    {
        //Repeated-squaring exponentiation. The slow alternative is to
        // multiply by in[idx] exp-1 times, reducing modulo mod after each
        // step, which yields the same residue.
        out[idx] = fastexp(in[idx], exp, mod);
    }
    out[len] = 0; //Terminate with a zero
    return 0;
}
#else //ifndef CUDA
__global__ void decrypt_kernel(long int* inout, long int exp, long int mod, size_t len)
//This is the CUDA "kernel." It will be run on each thread. A CUDA kernel is always
// of type void and specified as __global__
//Replaces every inout[t], t in [0, len), by fastexp(inout[t], exp, mod).
//inout must point to at least len long ints in device memory.
//Works with any one-dimensional grid and block size.
{
    //CUDA threads are divided into "blocks." The block number is stored in blockIdx.
    // The thread number within that block is stored in threadIdx. The number of
    // threads per block is given by blockDim. The total number of blocks is
    // gridDim.
    //Here, we assign each thread one number to encrypt/decrypt. Each block gets a
    // consecutive set of blockDim.x values. Once each value is processed, the
    // thread jumps ahead by blockDim.x*gridDim.x until all values are done.
    //Index and stride are size_t, like len, so they cannot wrap around for
    // large arrays; the loop condition is the only bounds check needed.
    const size_t stride = (size_t)blockDim.x * gridDim.x;
    for (size_t t = (size_t)blockIdx.x * blockDim.x + threadIdx.x; t < len; t += stride)
    {
        inout[t] = fastexp(inout[t], exp, mod);
    }
}
int encrypt(long int* in, long int exp, long int mod, long int* out, size_t len)
//Encrypt an array of long ints on the GPU
//exp and mod should be the public key pair
//Each number, c, is encrypted by
// c' = (c^exp)%mod
//in and out are host arrays; out must have room for len+1 elements because
// out[len] is written as a zero terminator.
//Returns 0 on success and 1 if any CUDA call fails (the error text is
// written to std::cerr; the first len elements of out are then unspecified).
{
    const size_t bytes = sizeof(long int)*len;
    long int *d_inout = NULL;
    cudaError_t err = cudaSuccess;

    if (len > 0) //A launch with zero blocks is invalid, so skip the GPU entirely
    {
        //Allocate memory in the separate memory space of the GPU
        err = cudaMalloc(&d_inout, bytes);
        //copy data to the GPU
        if (cudaSuccess == err)
            err = cudaMemcpy(d_inout, in, bytes, cudaMemcpyHostToDevice);
        if (cudaSuccess == err)
        {
            //Launch the kernel with blocks of BLOCKWID threads: enough blocks
            // to give every element a thread, capped at 1024 blocks (the
            // kernel loops, so a capped grid still covers all elements)
            size_t blocks = (len + BLOCKWID - 1)/BLOCKWID;
            if (blocks > 1024) blocks = 1024;
            decrypt_kernel<<<(unsigned int)blocks, BLOCKWID>>> (d_inout, exp, mod, len);
            //A kernel launch returns nothing; launch errors are reported here
            err = cudaGetLastError();
        }
        //copy data back from GPU (this blocking copy also waits for the kernel)
        if (cudaSuccess == err)
            err = cudaMemcpy(out, d_inout, bytes, cudaMemcpyDeviceToHost);
        //Released on every path; freeing a null pointer is harmless
        cudaFree(d_inout);
    }

    out[len]=0; //Terminate with a zero
    if (cudaSuccess != err)
    {
        std::cerr << "encrypt: CUDA error: " << cudaGetErrorString(err) << std::endl;
        return 1;
    }
    return 0;
}
int decrypt(long int* in, long int exp, long int mod, long int* out, size_t len)
//Decrypt an array of long ints on the GPU
//exp and mod should be the private key pair
//Each number, c', is decrypted by
// c = (c'^exp)%mod
//in and out are host arrays; out must have room for len+1 elements because
// out[len] is written as a zero terminator.
//Returns 0 on success and 1 if any CUDA call fails (the error text is
// written to std::cerr; the first len elements of out are then unspecified).
{
    const size_t bytes = sizeof(long int)*len;
    long int *d_inout = NULL;
    cudaError_t err = cudaSuccess;

    if (len > 0) //A launch with zero blocks is invalid, so skip the GPU entirely
    {
        //Allocate memory in the separate memory space of the GPU
        err = cudaMalloc(&d_inout, bytes);
        //copy data to the GPU
        if (cudaSuccess == err)
            err = cudaMemcpy(d_inout, in, bytes, cudaMemcpyHostToDevice);
        if (cudaSuccess == err)
        {
            //Launch the kernel with blocks of BLOCKWID threads: enough blocks
            // to give every element a thread, capped at 1024 blocks (the
            // kernel loops, so a capped grid still covers all elements)
            size_t blocks = (len + BLOCKWID - 1)/BLOCKWID;
            if (blocks > 1024) blocks = 1024;
            decrypt_kernel<<<(unsigned int)blocks, BLOCKWID>>> (d_inout, exp, mod, len);
            //A kernel launch returns nothing; launch errors are reported here
            err = cudaGetLastError();
        }
        //copy data back from GPU (this blocking copy also waits for the kernel)
        if (cudaSuccess == err)
            err = cudaMemcpy(out, d_inout, bytes, cudaMemcpyDeviceToHost);
        //Released on every path; freeing a null pointer is harmless
        cudaFree(d_inout);
    }

    out[len]=0; //Terminate with a zero
    if (cudaSuccess != err)
    {
        std::cerr << "decrypt: CUDA error: " << cudaGetErrorString(err) << std::endl;
        return 1;
    }
    return 0;
}
#endif
#ifdef __CUDA
__device__ __host__
#endif
long int fastexp(long int base, long int exp, long int mod)
//Faster modular exponentiation: returns (base^exp)%mod
//Essentially, we express the exponent as a binary number,
// then move one bit at a time multiplying output by
// base^(2^b) in the case of a 1 bit. b, here, is the bit
// position. Rather than recompute base^(2^b) each time
// we just generate base^(2^b) by squaring base^(2^(b-1))
// which is the number used for the last bit.
// All multiplications are modular with the given
// modulus, mod.
//Note that dividing by 2 is the same as a bit shift by one.
//Returns 1 when exp <= 0. The result follows the sign rules of the %
// operator, so a negative base can produce a negative result.
//Intermediate products stay below mod*mod in magnitude, so mod must be
// small enough for mod*mod to fit in a long int.
{
    long int out = 1;
    //Reduce the base once up front so that the first multiplication and
    // the first squaring cannot overflow when base is much larger than mod.
    // Skipped for exp <= 0, where no arithmetic on mod takes place.
    if (exp > 0) base %= mod;
    while(exp>0)
    {
        if(1==exp%2)
        {
            out*=base;
            out%=mod;
        }
        base=base*base;
        base%=mod;
        exp/=2;
    }
    return out;
}
long int gcd(long int p, long int q)
//Greatest common divisor (AKA greatest common factor) of p and q,
// computed with Euclid's method
{
    //Start with the larger value in "larger" and the other in "smaller"
    long int larger = p;
    long int smaller = q;
    if (larger < smaller)
    {
        larger = q;
        smaller = p;
    }
    //Each round replaces (larger, smaller) by (smaller, larger % smaller).
    //The remainders shrink until one of them is 0; the remainder just
    // before that one is the GCD and is left in "larger".
    for (; smaller != 0; )
    {
        const long int remainder = larger % smaller;
        larger = smaller;
        smaller = remainder;
    }
    return larger;
}
int long2char(long int* in, char* out, bool subtract_pairs)
//Narrows a list of long ints to chars and zero-terminates the result.
//Reading stops at the first position where two consecutive long ints are
// both zero.
//With subtract_pairs the input is consumed two values at a time and the
// first value of each pair is subtracted from the (narrowed) second one.
//Useful for outputting to stdout
{
    for (;;)
    {
        if (0 == in[0] && 0 == in[1]) break;
        long int offset = 0;
        if (subtract_pairs)
        {
            offset = *in;
            ++in;
        }
        //Narrow first, subtract afterwards; the assignment narrows again
        const char narrowed = (char)(*in);
        ++in;
        *out = narrowed - offset;
        ++out;
    }
    *out = '\0';
    return 0;
}
int char2long(char* in, long int* out, bool random_salt)
//Widens a zero-terminated string to a list of long ints.
//With random_salt every character is written as two values: a rand()-based
// number r followed by character + r. Without it each character is written
// as a single value.
//Two zeros are appended after the last value.
//Useful for converting input from stdin
{
    for (; *in != '\0'; ++in)
    {
        long int salt = 0;
        if (random_salt)
        {
            salt = rand()%INT_MAX - INT_MAX/2;
            *out = salt;
            ++out;
        }
        *out = (long int)(*in) + salt;
        ++out;
    }
    out[0] = 0;
    out[1] = 0;
    return 0;
}