v 0.3, checksum, speed improvements

PawelGorny · Jan 20, 2022 · 46221a2 · 46221a2
1 parent 3755e56
commit 46221a2
Show file tree

Hide file tree

Showing 4 changed files with 136 additions and 41 deletions.
diff --git a/README.md b/README.md
@@ -64,15 +64,13 @@ Program was prepared using CUDA 11.6 - for other version manual change in VS pro
 Performance
 -----------
 One's must modify number of blocks and number of threads in each block to find the ones which are the best for his card. Number of test performed by each thread also could have impact of global performance/latency.  
-Test card: RTX3060 (-b 224 -t 512 -s 3364) checks around 1000Mkey/s for uncompressed address and around 2100 MKey/s for compressed address. 
-Example above extended to 7 missing characters was solved in 12 minutes (uncompressed starting key: 80c59cb0997ad73f7bf8621b1955caf80b304ded0a48e5b8f28c31b30a90d68ffcabd9b283); work done = 19 * 6 missing characters.
-With the same configuration of the card but for compressed address, the full range of 7 missing characters was solved in 18 minutes (compressed starting key: 8070cfa0d40309798a5bd144a396478b5b5ae3305b7413601b18758c81b73fb371a1c9d1823a).
+Test card: RTX3060 (-b 224 -t 512 -s 3364) checks around 3600 MKey/s for compressed address with missing characters in the middle and around 1300Mkey/s for other cases.
 
        
 TODO
 ----
 * code cleaning, review of hash functions
 * build configuration for Linux
-* solver for missing characters at the left side (with a known expected checksum)
 * predefined custom step (using list of possible characters)
-* reading configuration from file
+* reading configuration from file
+* build-in stride calculcater
diff --git a/WifSolverCuda/Worker.cuh b/WifSolverCuda/Worker.cuh
@@ -7,9 +7,10 @@
 #include "lib/Math.cuh"
 
 
-__global__ void kernelUncompressed(bool* buffResult, uint64_t* buffRangeStart, uint64_t* buffStride, const int threadNumberOfChecks);
-__global__ void kernelCompressed(bool* buffResult, uint64_t* buffRangeStart, uint64_t* buffStride, const int threadNumberOfChecks);
-
+__global__ void kernelUncompressed(bool* buffResult, bool* buffCollectorWork, uint64_t* buffRangeStart, uint64_t* buffStride, const int threadNumberOfChecks);
+__global__ void kernelCompressed(bool* buffResult, bool* buffCollectorWork, uint64_t* buffRangeStart, uint64_t* buffStride, const int threadNumberOfChecks);
+__global__ void kernelUncompressed(bool* buffResult, bool* buffCollectorWork, uint64_t* buffRangeStart, uint64_t* buffStride, const int threadNumberOfChecks, const uint32_t checksum);
+__global__ void kernelCompressed(bool* buffResult, bool* buffCollectorWork, uint64_t* buffRangeStart, uint64_t* buffStride, const int threadNumberOfChecks, const uint32_t checksum);
 __global__ void resultCollector(bool* buffResult, uint64_t* buffCombinedResult, const uint64_t threadNumberOfChecks);
 
 __device__ bool _checksumDoubleSha256CheckUncompressed(unsigned int checksum, beu32* d_hash, uint64_t* _start);

diff --git a/WifSolverCuda/Worker1.cu b/WifSolverCuda/Worker1.cu
@@ -1,7 +1,62 @@
 #include "Worker.cuh"
 
-//todo collector preparer _shared
-__global__ void kernelUncompressed(bool* buffResult, uint64_t* buffRangeStart, uint64_t* buffStride, const int threadNumberOfChecks) {
+__global__ void kernelUncompressed(bool* buffResult, bool* buffCollectorWork, uint64_t* buffRangeStart, uint64_t* buffStride, const int threadNumberOfChecks, const uint32_t checksum) {
+
+    uint64_t _stride[5];
+    uint64_t _start[5];
+    uint64_t _startStride[5];
+    _load(_start, buffRangeStart);
+    _load(_stride, buffStride);
+
+    int64_t tIx = (threadIdx.x + blockIdx.x * blockDim.x) * threadNumberOfChecks;
+    IMult(_startStride, _stride, tIx);
+    _add(_start, _startStride);
+    beu32 d_hash[8];
+    bool tempResult = false;
+    bool setHit = false;
+    for (uint64_t i = 0, resultIx = tIx; i < threadNumberOfChecks; i++, resultIx++) {
+        tempResult = _checksumDoubleSha256CheckUncompressed(checksum, d_hash, _start);
+        buffResult[resultIx] = tempResult;
+        _add(_start, _stride);
+        if (tempResult && !setHit) {
+            setHit = true;
+        }
+    }
+    if (setHit) {
+        buffCollectorWork[0] = true;
+    }
+}
+__global__ void kernelCompressed(bool* buffResult, bool* buffCollectorWork, uint64_t* buffRangeStart, uint64_t* buffStride, const int threadNumberOfChecks, const uint32_t checksum) {
+    uint64_t _stride[5];
+    uint64_t _start[5];
+    uint64_t _startStride[5];
+    _load(_start, buffRangeStart);
+    _load(_stride, buffStride);
+
+    int64_t tIx = (threadIdx.x + blockIdx.x * blockDim.x) * threadNumberOfChecks;
+    IMult(_startStride, _stride, tIx);
+    _add(_start, _startStride);
+    beu32 d_hash[8];
+    bool tempResult = false;
+    bool setHit = false;
+    for (uint64_t i = 0, resultIx = tIx; i < threadNumberOfChecks; i++, resultIx++) {
+        if (((_start[0] & 0xff00000000) >> 32) != 0x01) {
+            _add(_start, _stride);
+            buffResult[resultIx] = false;
+            continue;
+        }
+        tempResult = _checksumDoubleSha256CheckCompressed(checksum, d_hash, _start);
+        buffResult[resultIx] = tempResult;
+        _add(_start, _stride);
+        if (tempResult && !setHit) {
+            setHit = true;
+        }
+    }
+    if (setHit) {
+        buffCollectorWork[0] = true;
+    }
+}
+__global__ void kernelUncompressed(bool* buffResult, bool* buffCollectorWork, uint64_t* buffRangeStart, uint64_t* buffStride, const int threadNumberOfChecks) {
 	uint64_t _stride[5];
 	uint64_t _start[5];
     uint64_t _startStride[5];
@@ -12,17 +67,21 @@ __global__ void kernelUncompressed(bool* buffResult, uint64_t* buffRangeStart, u
     IMult(_startStride, _stride, tIx);
     _add(_start, _startStride);
     beu32 d_hash[8];
-
+    bool tempResult = false;
+    bool setHit = false;
 	for (uint64_t i = 0, resultIx = tIx ; i < threadNumberOfChecks; i++, resultIx++) {
-        unsigned int checksum = _start[0] & 0xffffffff;
-        buffResult[resultIx] = false;
-        if (_checksumDoubleSha256CheckUncompressed(checksum, d_hash, _start)) {
-            buffResult[resultIx] = true;
-        }		
+        tempResult = _checksumDoubleSha256CheckUncompressed(_start[0] & 0xffffffff, d_hash, _start);
+        buffResult[resultIx] = tempResult;
 		_add(_start, _stride);
+        if (tempResult && !setHit) {
+            setHit = true;
+        }
 	}
+    if (setHit) {
+        buffCollectorWork[0] = true;
+    }
 }
-__global__ void kernelCompressed(bool* buffResult, uint64_t* buffRangeStart, uint64_t* buffStride, const int threadNumberOfChecks) {
+__global__ void kernelCompressed(bool* buffResult, bool* buffCollectorWork, uint64_t* buffRangeStart, uint64_t* buffStride, const int threadNumberOfChecks) {
     uint64_t _stride[5];
     uint64_t _start[5];
     uint64_t _startStride[5];
@@ -33,19 +92,23 @@ __global__ void kernelCompressed(bool* buffResult, uint64_t* buffRangeStart, uin
     IMult(_startStride, _stride, tIx);
     _add(_start, _startStride);
     beu32 d_hash[8];
-
+    bool tempResult = false;
+    bool setHit = false;
     for (uint64_t i = 0, resultIx = tIx; i < threadNumberOfChecks; i++, resultIx++) {
         if (((_start[0] & 0xff00000000) >> 32) != 0x01) {
             _add(_start, _stride);
             buffResult[resultIx] = false;
             continue;
         }
-        unsigned int checksum = _start[0] & 0xffffffff;    
-        buffResult[resultIx] = false;
-        if (_checksumDoubleSha256CheckCompressed(checksum, d_hash, _start)) {
-            buffResult[resultIx] = true;
-        }   
+        tempResult = _checksumDoubleSha256CheckCompressed(_start[0] & 0xffffffff, d_hash, _start);
+        buffResult[resultIx] = tempResult;
         _add(_start, _stride);
+        if (tempResult && !setHit) {
+            setHit = true;
+        }
+    }
+    if (setHit) {
+        buffCollectorWork[0] = true;
     }
 }
 

diff --git a/WifSolverCuda/main.cu b/WifSolverCuda/main.cu
@@ -38,6 +38,9 @@ Int STRIDE, RANGE_START, RANGE_END;
 Int loopStride;
 Int counter;
 string TARGET_ADDRESS = "";
+Int CHECKSUM;
+bool IS_CHECKSUM = false;
+
 
 bool RESULT = false;
 bool useCollector = false;
@@ -54,7 +57,7 @@ Secp256K1* secp;
 
 int main(int argc, char** argv)
 {    
-    printf("WifSolver 0.2\n\n");
+    printf("WifSolver 0.3\n\n");
 
     if (readArgs(argc, argv)) {
         showHelp(); 
@@ -101,8 +104,8 @@ cudaError_t processCuda() {
 
     cudaStatus = cudaMalloc((void**)&dev_buffRangeStart, NB64BLOCK * sizeof(uint64_t));
     cudaStatus = cudaMalloc((void**)&dev_buffStride, NB64BLOCK * sizeof(uint64_t));
-    cudaStatus = cudaMemcpy(dev_buffStride, buffStride, NB64BLOCK * sizeof(uint64_t), cudaMemcpyHostToDevice);
-    
+    cudaStatus = cudaMemcpy(dev_buffStride, buffStride, NB64BLOCK * sizeof(uint64_t), cudaMemcpyHostToDevice);      
+
     bool* buffDeviceResult = new bool[outputSize];
     bool* dev_buffDeviceResult = new bool[outputSize];
     cudaStatus = cudaMalloc((void**)&dev_buffDeviceResult, outputSize * sizeof(bool));
@@ -114,22 +117,39 @@ cudaError_t processCuda() {
     cudaStatus = cudaMalloc((void**)&dev_buffResult, COLLECTOR_SIZE * sizeof(uint64_t));
     cudaStatus = cudaMemcpy(dev_buffResult, buffResult, COLLECTOR_SIZE * sizeof(uint64_t), cudaMemcpyHostToDevice);
 
+    bool* buffCollectorWork = new bool[1];
+    buffCollectorWork[0] = false;
+    bool* dev_buffCollectorWork = new bool[1];
+    cudaStatus = cudaMalloc((void**)&dev_buffCollectorWork, 1 * sizeof(bool));
+    cudaStatus = cudaMemcpy(dev_buffCollectorWork, buffCollectorWork, 1 * sizeof(bool), cudaMemcpyHostToDevice);
+
+    const uint32_t expectedChecksum = IS_CHECKSUM ? CHECKSUM.GetInt32() : 0;
+
     uint64_t counter = 0;
     int counterSaveFile = 0;
 
     std::chrono::steady_clock::time_point begin = std::chrono::steady_clock::now();
-
+    
     while (!RESULT && RANGE_START.IsLower(&RANGE_END)) {
 
         //prepare launch
         __Load(buffRangeStart, RANGE_START.bits64);
         cudaStatus = cudaMemcpy(dev_buffRangeStart, buffRangeStart, NB64BLOCK * sizeof(uint64_t), cudaMemcpyHostToDevice);
         //launch work
         if (COMPRESSED) {
-            kernelCompressed << <BLOCK_NUMBER, BLOCK_THREADS >> > (dev_buffDeviceResult, dev_buffRangeStart, dev_buffStride, THREAD_STEPS);
+            if (IS_CHECKSUM) {
+                kernelCompressed << <BLOCK_NUMBER, BLOCK_THREADS >> > (dev_buffDeviceResult, dev_buffCollectorWork, dev_buffRangeStart, dev_buffStride, THREAD_STEPS, expectedChecksum);
+            }else{
+                kernelCompressed << <BLOCK_NUMBER, BLOCK_THREADS >> > (dev_buffDeviceResult, dev_buffCollectorWork, dev_buffRangeStart, dev_buffStride, THREAD_STEPS);
+            }            
         }
         else {            
-            kernelUncompressed << <BLOCK_NUMBER, BLOCK_THREADS >> > (dev_buffDeviceResult, dev_buffRangeStart, dev_buffStride, THREAD_STEPS);
+            if (IS_CHECKSUM) {
+                kernelUncompressed << <BLOCK_NUMBER, BLOCK_THREADS >> > (dev_buffDeviceResult, dev_buffCollectorWork, dev_buffRangeStart, dev_buffStride, THREAD_STEPS, expectedChecksum);
+            }else{
+                kernelUncompressed << <BLOCK_NUMBER, BLOCK_THREADS >> > (dev_buffDeviceResult, dev_buffCollectorWork, dev_buffRangeStart, dev_buffStride, THREAD_STEPS);
+            }
+
         }        
         cudaStatus = cudaGetLastError();
         if (cudaStatus != cudaSuccess) {
@@ -144,8 +164,11 @@ cudaError_t processCuda() {
         //std::cout << "Time difference = " << std::chrono::duration_cast<std::chrono::milliseconds>(std::chrono::steady_clock::now() - begin).count() << "[ms]" << std::endl;
 
         if (useCollector) {
-            //summarize results
-            bool anyResult = true;
+            //summarize results            
+            cudaStatus = cudaMemcpy(buffCollectorWork, dev_buffCollectorWork, 1 * sizeof(bool), cudaMemcpyDeviceToHost);
+            bool anyResult = buffCollectorWork[0];
+            buffCollectorWork[0] = false;
+            cudaStatus = cudaMemcpy(dev_buffCollectorWork, buffCollectorWork, 1 * sizeof(bool), cudaMemcpyHostToDevice);
             while (anyResult) {
                 resultCollector << <BLOCK_NUMBER, 1 >> > (dev_buffDeviceResult, dev_buffResult, THREAD_STEPS * BLOCK_THREADS);
                 cudaStatus = cudaGetLastError();
@@ -196,7 +219,7 @@ cudaError_t processCuda() {
         //std::cout << "Time difference = " << std::chrono::duration_cast<std::chrono::milliseconds>(std::chrono::steady_clock::now() - begin).count() << "[ms]" << std::endl;        
         RANGE_START.Add(&loopStride);
         counter += outputSize;
-        int32_t t = std::chrono::duration_cast<std::chrono::milliseconds>(std::chrono::steady_clock::now() - begin).count();
+        uint64_t t = std::chrono::duration_cast<std::chrono::milliseconds>(std::chrono::steady_clock::now() - begin).count();
         if ( t > 5000) {
             double speed = (double)((double)counter / (t/1000) ) / 1000000.0;            
             std::string speedStr;
@@ -229,6 +252,7 @@ Error:
     cudaFree(dev_buffDeviceResult);
     cudaFree(dev_buffRangeStart);
     cudaFree(dev_buffStride);
+    cudaFree(dev_buffCollectorWork);
     return cudaStatus;
 }
 
@@ -268,15 +292,18 @@ void processCandidate(Int &toTest) {
 
 void printConfig() {
     printf("Range start: %s\n", RANGE_START.GetBase16().c_str());
-    printf( "Range end  : %s\n", RANGE_END.GetBase16().c_str());
-    printf( "Stride     : %s\n", STRIDE.GetBase16().c_str());
+    printf("Range end  : %s\n", RANGE_END.GetBase16().c_str());
+    printf("Stride     : %s\n", STRIDE.GetBase16().c_str());
     if (!TARGET_ADDRESS.empty()) {
         printf( "Target     : %s\n", TARGET_ADDRESS.c_str());
     }
     if (COMPRESSED) {
         printf("Target COMPRESSED\n");
     }    else    {
         printf("Target UNCOMPRESSED\n");
+    }   
+    if (IS_CHECKSUM) {
+        printf("Checksum   : %s\n", CHECKSUM.GetBase16().c_str());
     }
     printf( "\n");
     printf( "number of blocks: %d\n", BLOCK_NUMBER);
@@ -320,16 +347,17 @@ bool checkDevice() {
 void showHelp() {    
     printf("WifSolverCuda [-d deviceId] [-b NbBlocks] [-t NbThreads] [-s NbThreadChecks]\n");
     printf("    [-fresultp reportFile] [-fresult resultFile] [-fstatus statusFile] [-a targetAddress]\n");
-    printf("    -rangeStart hexKeyStart -rangeEnd hexKeyEnd -stride hexKeyStride\n\n");
+    printf("    -stride hexKeyStride -rangeStart hexKeyStart [-rangeEnd hexKeyEnd] [-checksum hexChecksum] \n\n");
     printf("-rangeStart hexKeyStart: decoded initial key with compression flag and checksum \n");
     printf("-rangeEnd hexKeyEnd: decoded end key with compression flag and checksum \n");
+    printf("-checksum hexChecksum: decoded checksum, cannot be modified with a stride  \n");
     printf("-stride hexKeyStride: full stride calculated as 58^(missing char index) \n");
     printf("-fresult resultFile: file for final result (default: %s)\n", fileResult.c_str());
     printf("-fresultp reportFile: file for each WIF with correct checksum (default: %s)\n", fileResultPartial.c_str());
     printf("-fstatus statusFile: file for periodically saved status (default: %s) \n", fileStatus.c_str());
     printf("-d deviceId: default 0\n");
-    printf("-c search for compressed address\n");
-    printf("-u search for uncompressed address (default)\n");
+    printf("-c : search for compressed address\n");
+    printf("-u : search for uncompressed address (default)\n");
     printf("-b NbBlocks: default processorCount * 12\n");
     printf("-t NbThreads: default deviceMax / 4\n");
     printf("-s NbThreadChecks: default 3364\n");
@@ -344,12 +372,12 @@ bool readArgs(int argc, char** argv) {
     while (a < argc) {
         if (strcmp(argv[a], "-h") == 0) {
             return true;
-        }else
-        if (strcmp(argv[a], "-d") == 0) {
+        }
+        else if (strcmp(argv[a], "-d") == 0) {
             a++;
             DEVICE_NR = strtol(argv[a], NULL, 10);
-        }else
-        if (strcmp(argv[a], "-c") == 0) {
+        }
+        else if (strcmp(argv[a], "-c") == 0) {
             COMPRESSED = true;
         }
         else if (strcmp(argv[a], "-u") == 0) {
@@ -398,6 +426,11 @@ bool readArgs(int argc, char** argv) {
             a++;
             TARGET_ADDRESS = string(argv[a]);
         }
+        else if (strcmp(argv[a], "-checksum") == 0) {
+            a++;
+            CHECKSUM.SetBase16((char*)string(argv[a]).c_str());
+            IS_CHECKSUM = true;
+        }
         a++;
     }