Skip to content

Commit

Permalink
v 0.3, checksum, speed improvements
Browse files Browse the repository at this point in the history
  • Loading branch information
PawelGorny committed Jan 20, 2022
1 parent 3755e56 commit 46221a2
Show file tree
Hide file tree
Showing 4 changed files with 136 additions and 41 deletions.
8 changes: 3 additions & 5 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -64,15 +64,13 @@ Program was prepared using CUDA 11.6 - for other version manual change in VS pro
Performance
-----------
One's must modify number of blocks and number of threads in each block to find the ones which are the best for his card. Number of test performed by each thread also could have impact of global performance/latency.
Test card: RTX3060 (-b 224 -t 512 -s 3364) checks around 1000Mkey/s for uncompressed address and around 2100 MKey/s for compressed address.
Example above extended to 7 missing characters was solved in 12 minutes (uncompressed starting key: 80c59cb0997ad73f7bf8621b1955caf80b304ded0a48e5b8f28c31b30a90d68ffcabd9b283); work done = 19 * 6 missing characters.
With the same configuration of the card but for compressed address, the full range of 7 missing characters was solved in 18 minutes (compressed starting key: 8070cfa0d40309798a5bd144a396478b5b5ae3305b7413601b18758c81b73fb371a1c9d1823a).
Test card: RTX3060 (-b 224 -t 512 -s 3364) checks around 3600 MKey/s for compressed address with missing characters in the middle and around 1300Mkey/s for other cases.

TODO
----
* code cleaning, review of hash functions
* build configuration for Linux
* solver for missing characters at the left side (with a known expected checksum)
* predefined custom step (using list of possible characters)
* reading configuration from file
* reading configuration from file
* build-in stride calculcater
7 changes: 4 additions & 3 deletions WifSolverCuda/Worker.cuh
Original file line number Diff line number Diff line change
Expand Up @@ -7,9 +7,10 @@
#include "lib/Math.cuh"


__global__ void kernelUncompressed(bool* buffResult, uint64_t* buffRangeStart, uint64_t* buffStride, const int threadNumberOfChecks);
__global__ void kernelCompressed(bool* buffResult, uint64_t* buffRangeStart, uint64_t* buffStride, const int threadNumberOfChecks);

__global__ void kernelUncompressed(bool* buffResult, bool* buffCollectorWork, uint64_t* buffRangeStart, uint64_t* buffStride, const int threadNumberOfChecks);
__global__ void kernelCompressed(bool* buffResult, bool* buffCollectorWork, uint64_t* buffRangeStart, uint64_t* buffStride, const int threadNumberOfChecks);
__global__ void kernelUncompressed(bool* buffResult, bool* buffCollectorWork, uint64_t* buffRangeStart, uint64_t* buffStride, const int threadNumberOfChecks, const uint32_t checksum);
__global__ void kernelCompressed(bool* buffResult, bool* buffCollectorWork, uint64_t* buffRangeStart, uint64_t* buffStride, const int threadNumberOfChecks, const uint32_t checksum);
__global__ void resultCollector(bool* buffResult, uint64_t* buffCombinedResult, const uint64_t threadNumberOfChecks);

__device__ bool _checksumDoubleSha256CheckUncompressed(unsigned int checksum, beu32* d_hash, uint64_t* _start);
Expand Down
93 changes: 78 additions & 15 deletions WifSolverCuda/Worker1.cu
Original file line number Diff line number Diff line change
@@ -1,7 +1,62 @@
#include "Worker.cuh"

//todo collector preparer _shared
__global__ void kernelUncompressed(bool* buffResult, uint64_t* buffRangeStart, uint64_t* buffStride, const int threadNumberOfChecks) {
__global__ void kernelUncompressed(bool* buffResult, bool* buffCollectorWork, uint64_t* buffRangeStart, uint64_t* buffStride, const int threadNumberOfChecks, const uint32_t checksum) {

uint64_t _stride[5];
uint64_t _start[5];
uint64_t _startStride[5];
_load(_start, buffRangeStart);
_load(_stride, buffStride);

int64_t tIx = (threadIdx.x + blockIdx.x * blockDim.x) * threadNumberOfChecks;
IMult(_startStride, _stride, tIx);
_add(_start, _startStride);
beu32 d_hash[8];
bool tempResult = false;
bool setHit = false;
for (uint64_t i = 0, resultIx = tIx; i < threadNumberOfChecks; i++, resultIx++) {
tempResult = _checksumDoubleSha256CheckUncompressed(checksum, d_hash, _start);
buffResult[resultIx] = tempResult;
_add(_start, _stride);
if (tempResult && !setHit) {
setHit = true;
}
}
if (setHit) {
buffCollectorWork[0] = true;
}
}
__global__ void kernelCompressed(bool* buffResult, bool* buffCollectorWork, uint64_t* buffRangeStart, uint64_t* buffStride, const int threadNumberOfChecks, const uint32_t checksum) {
uint64_t _stride[5];
uint64_t _start[5];
uint64_t _startStride[5];
_load(_start, buffRangeStart);
_load(_stride, buffStride);

int64_t tIx = (threadIdx.x + blockIdx.x * blockDim.x) * threadNumberOfChecks;
IMult(_startStride, _stride, tIx);
_add(_start, _startStride);
beu32 d_hash[8];
bool tempResult = false;
bool setHit = false;
for (uint64_t i = 0, resultIx = tIx; i < threadNumberOfChecks; i++, resultIx++) {
if (((_start[0] & 0xff00000000) >> 32) != 0x01) {
_add(_start, _stride);
buffResult[resultIx] = false;
continue;
}
tempResult = _checksumDoubleSha256CheckCompressed(checksum, d_hash, _start);
buffResult[resultIx] = tempResult;
_add(_start, _stride);
if (tempResult && !setHit) {
setHit = true;
}
}
if (setHit) {
buffCollectorWork[0] = true;
}
}
__global__ void kernelUncompressed(bool* buffResult, bool* buffCollectorWork, uint64_t* buffRangeStart, uint64_t* buffStride, const int threadNumberOfChecks) {
uint64_t _stride[5];
uint64_t _start[5];
uint64_t _startStride[5];
Expand All @@ -12,17 +67,21 @@ __global__ void kernelUncompressed(bool* buffResult, uint64_t* buffRangeStart, u
IMult(_startStride, _stride, tIx);
_add(_start, _startStride);
beu32 d_hash[8];

bool tempResult = false;
bool setHit = false;
for (uint64_t i = 0, resultIx = tIx ; i < threadNumberOfChecks; i++, resultIx++) {
unsigned int checksum = _start[0] & 0xffffffff;
buffResult[resultIx] = false;
if (_checksumDoubleSha256CheckUncompressed(checksum, d_hash, _start)) {
buffResult[resultIx] = true;
}
tempResult = _checksumDoubleSha256CheckUncompressed(_start[0] & 0xffffffff, d_hash, _start);
buffResult[resultIx] = tempResult;
_add(_start, _stride);
if (tempResult && !setHit) {
setHit = true;
}
}
if (setHit) {
buffCollectorWork[0] = true;
}
}
__global__ void kernelCompressed(bool* buffResult, uint64_t* buffRangeStart, uint64_t* buffStride, const int threadNumberOfChecks) {
__global__ void kernelCompressed(bool* buffResult, bool* buffCollectorWork, uint64_t* buffRangeStart, uint64_t* buffStride, const int threadNumberOfChecks) {
uint64_t _stride[5];
uint64_t _start[5];
uint64_t _startStride[5];
Expand All @@ -33,19 +92,23 @@ __global__ void kernelCompressed(bool* buffResult, uint64_t* buffRangeStart, uin
IMult(_startStride, _stride, tIx);
_add(_start, _startStride);
beu32 d_hash[8];

bool tempResult = false;
bool setHit = false;
for (uint64_t i = 0, resultIx = tIx; i < threadNumberOfChecks; i++, resultIx++) {
if (((_start[0] & 0xff00000000) >> 32) != 0x01) {
_add(_start, _stride);
buffResult[resultIx] = false;
continue;
}
unsigned int checksum = _start[0] & 0xffffffff;
buffResult[resultIx] = false;
if (_checksumDoubleSha256CheckCompressed(checksum, d_hash, _start)) {
buffResult[resultIx] = true;
}
tempResult = _checksumDoubleSha256CheckCompressed(_start[0] & 0xffffffff, d_hash, _start);
buffResult[resultIx] = tempResult;
_add(_start, _stride);
if (tempResult && !setHit) {
setHit = true;
}
}
if (setHit) {
buffCollectorWork[0] = true;
}
}

Expand Down
69 changes: 51 additions & 18 deletions WifSolverCuda/main.cu
Original file line number Diff line number Diff line change
Expand Up @@ -38,6 +38,9 @@ Int STRIDE, RANGE_START, RANGE_END;
Int loopStride;
Int counter;
string TARGET_ADDRESS = "";
Int CHECKSUM;
bool IS_CHECKSUM = false;


bool RESULT = false;
bool useCollector = false;
Expand All @@ -54,7 +57,7 @@ Secp256K1* secp;

int main(int argc, char** argv)
{
printf("WifSolver 0.2\n\n");
printf("WifSolver 0.3\n\n");

if (readArgs(argc, argv)) {
showHelp();
Expand Down Expand Up @@ -101,8 +104,8 @@ cudaError_t processCuda() {

cudaStatus = cudaMalloc((void**)&dev_buffRangeStart, NB64BLOCK * sizeof(uint64_t));
cudaStatus = cudaMalloc((void**)&dev_buffStride, NB64BLOCK * sizeof(uint64_t));
cudaStatus = cudaMemcpy(dev_buffStride, buffStride, NB64BLOCK * sizeof(uint64_t), cudaMemcpyHostToDevice);
cudaStatus = cudaMemcpy(dev_buffStride, buffStride, NB64BLOCK * sizeof(uint64_t), cudaMemcpyHostToDevice);

bool* buffDeviceResult = new bool[outputSize];
bool* dev_buffDeviceResult = new bool[outputSize];
cudaStatus = cudaMalloc((void**)&dev_buffDeviceResult, outputSize * sizeof(bool));
Expand All @@ -114,22 +117,39 @@ cudaError_t processCuda() {
cudaStatus = cudaMalloc((void**)&dev_buffResult, COLLECTOR_SIZE * sizeof(uint64_t));
cudaStatus = cudaMemcpy(dev_buffResult, buffResult, COLLECTOR_SIZE * sizeof(uint64_t), cudaMemcpyHostToDevice);

bool* buffCollectorWork = new bool[1];
buffCollectorWork[0] = false;
bool* dev_buffCollectorWork = new bool[1];
cudaStatus = cudaMalloc((void**)&dev_buffCollectorWork, 1 * sizeof(bool));
cudaStatus = cudaMemcpy(dev_buffCollectorWork, buffCollectorWork, 1 * sizeof(bool), cudaMemcpyHostToDevice);

const uint32_t expectedChecksum = IS_CHECKSUM ? CHECKSUM.GetInt32() : 0;

uint64_t counter = 0;
int counterSaveFile = 0;

std::chrono::steady_clock::time_point begin = std::chrono::steady_clock::now();

while (!RESULT && RANGE_START.IsLower(&RANGE_END)) {

//prepare launch
__Load(buffRangeStart, RANGE_START.bits64);
cudaStatus = cudaMemcpy(dev_buffRangeStart, buffRangeStart, NB64BLOCK * sizeof(uint64_t), cudaMemcpyHostToDevice);
//launch work
if (COMPRESSED) {
kernelCompressed << <BLOCK_NUMBER, BLOCK_THREADS >> > (dev_buffDeviceResult, dev_buffRangeStart, dev_buffStride, THREAD_STEPS);
if (IS_CHECKSUM) {
kernelCompressed << <BLOCK_NUMBER, BLOCK_THREADS >> > (dev_buffDeviceResult, dev_buffCollectorWork, dev_buffRangeStart, dev_buffStride, THREAD_STEPS, expectedChecksum);
}else{
kernelCompressed << <BLOCK_NUMBER, BLOCK_THREADS >> > (dev_buffDeviceResult, dev_buffCollectorWork, dev_buffRangeStart, dev_buffStride, THREAD_STEPS);
}
}
else {
kernelUncompressed << <BLOCK_NUMBER, BLOCK_THREADS >> > (dev_buffDeviceResult, dev_buffRangeStart, dev_buffStride, THREAD_STEPS);
if (IS_CHECKSUM) {
kernelUncompressed << <BLOCK_NUMBER, BLOCK_THREADS >> > (dev_buffDeviceResult, dev_buffCollectorWork, dev_buffRangeStart, dev_buffStride, THREAD_STEPS, expectedChecksum);
}else{
kernelUncompressed << <BLOCK_NUMBER, BLOCK_THREADS >> > (dev_buffDeviceResult, dev_buffCollectorWork, dev_buffRangeStart, dev_buffStride, THREAD_STEPS);
}

}
cudaStatus = cudaGetLastError();
if (cudaStatus != cudaSuccess) {
Expand All @@ -144,8 +164,11 @@ cudaError_t processCuda() {
//std::cout << "Time difference = " << std::chrono::duration_cast<std::chrono::milliseconds>(std::chrono::steady_clock::now() - begin).count() << "[ms]" << std::endl;

if (useCollector) {
//summarize results
bool anyResult = true;
//summarize results
cudaStatus = cudaMemcpy(buffCollectorWork, dev_buffCollectorWork, 1 * sizeof(bool), cudaMemcpyDeviceToHost);
bool anyResult = buffCollectorWork[0];
buffCollectorWork[0] = false;
cudaStatus = cudaMemcpy(dev_buffCollectorWork, buffCollectorWork, 1 * sizeof(bool), cudaMemcpyHostToDevice);
while (anyResult) {
resultCollector << <BLOCK_NUMBER, 1 >> > (dev_buffDeviceResult, dev_buffResult, THREAD_STEPS * BLOCK_THREADS);
cudaStatus = cudaGetLastError();
Expand Down Expand Up @@ -196,7 +219,7 @@ cudaError_t processCuda() {
//std::cout << "Time difference = " << std::chrono::duration_cast<std::chrono::milliseconds>(std::chrono::steady_clock::now() - begin).count() << "[ms]" << std::endl;
RANGE_START.Add(&loopStride);
counter += outputSize;
int32_t t = std::chrono::duration_cast<std::chrono::milliseconds>(std::chrono::steady_clock::now() - begin).count();
uint64_t t = std::chrono::duration_cast<std::chrono::milliseconds>(std::chrono::steady_clock::now() - begin).count();
if ( t > 5000) {
double speed = (double)((double)counter / (t/1000) ) / 1000000.0;
std::string speedStr;
Expand Down Expand Up @@ -229,6 +252,7 @@ Error:
cudaFree(dev_buffDeviceResult);
cudaFree(dev_buffRangeStart);
cudaFree(dev_buffStride);
cudaFree(dev_buffCollectorWork);
return cudaStatus;
}

Expand Down Expand Up @@ -268,15 +292,18 @@ void processCandidate(Int &toTest) {

void printConfig() {
printf("Range start: %s\n", RANGE_START.GetBase16().c_str());
printf( "Range end : %s\n", RANGE_END.GetBase16().c_str());
printf( "Stride : %s\n", STRIDE.GetBase16().c_str());
printf("Range end : %s\n", RANGE_END.GetBase16().c_str());
printf("Stride : %s\n", STRIDE.GetBase16().c_str());
if (!TARGET_ADDRESS.empty()) {
printf( "Target : %s\n", TARGET_ADDRESS.c_str());
}
if (COMPRESSED) {
printf("Target COMPRESSED\n");
} else {
printf("Target UNCOMPRESSED\n");
}
if (IS_CHECKSUM) {
printf("Checksum : %s\n", CHECKSUM.GetBase16().c_str());
}
printf( "\n");
printf( "number of blocks: %d\n", BLOCK_NUMBER);
Expand Down Expand Up @@ -320,16 +347,17 @@ bool checkDevice() {
void showHelp() {
printf("WifSolverCuda [-d deviceId] [-b NbBlocks] [-t NbThreads] [-s NbThreadChecks]\n");
printf(" [-fresultp reportFile] [-fresult resultFile] [-fstatus statusFile] [-a targetAddress]\n");
printf(" -rangeStart hexKeyStart -rangeEnd hexKeyEnd -stride hexKeyStride\n\n");
printf(" -stride hexKeyStride -rangeStart hexKeyStart [-rangeEnd hexKeyEnd] [-checksum hexChecksum] \n\n");
printf("-rangeStart hexKeyStart: decoded initial key with compression flag and checksum \n");
printf("-rangeEnd hexKeyEnd: decoded end key with compression flag and checksum \n");
printf("-checksum hexChecksum: decoded checksum, cannot be modified with a stride \n");
printf("-stride hexKeyStride: full stride calculated as 58^(missing char index) \n");
printf("-fresult resultFile: file for final result (default: %s)\n", fileResult.c_str());
printf("-fresultp reportFile: file for each WIF with correct checksum (default: %s)\n", fileResultPartial.c_str());
printf("-fstatus statusFile: file for periodically saved status (default: %s) \n", fileStatus.c_str());
printf("-d deviceId: default 0\n");
printf("-c search for compressed address\n");
printf("-u search for uncompressed address (default)\n");
printf("-c : search for compressed address\n");
printf("-u : search for uncompressed address (default)\n");
printf("-b NbBlocks: default processorCount * 12\n");
printf("-t NbThreads: default deviceMax / 4\n");
printf("-s NbThreadChecks: default 3364\n");
Expand All @@ -344,12 +372,12 @@ bool readArgs(int argc, char** argv) {
while (a < argc) {
if (strcmp(argv[a], "-h") == 0) {
return true;
}else
if (strcmp(argv[a], "-d") == 0) {
}
else if (strcmp(argv[a], "-d") == 0) {
a++;
DEVICE_NR = strtol(argv[a], NULL, 10);
}else
if (strcmp(argv[a], "-c") == 0) {
}
else if (strcmp(argv[a], "-c") == 0) {
COMPRESSED = true;
}
else if (strcmp(argv[a], "-u") == 0) {
Expand Down Expand Up @@ -398,6 +426,11 @@ bool readArgs(int argc, char** argv) {
a++;
TARGET_ADDRESS = string(argv[a]);
}
else if (strcmp(argv[a], "-checksum") == 0) {
a++;
CHECKSUM.SetBase16((char*)string(argv[a]).c_str());
IS_CHECKSUM = true;
}
a++;
}

Expand Down

0 comments on commit 46221a2

Please sign in to comment.