diff --git a/README.md b/README.md
index 6fc5199..ecb0c9a 100644
--- a/README.md
+++ b/README.md
@@ -25,7 +25,7 @@ Usage:
 -fresultp reportFile: file for each WIF with correct checksum (default: result_partial.txt)
 -fstatus statusFile: file for periodically saved status (default: fileStatus.txt)
 -fstatusIntv seconds: period between status file updates (default 60 sec)
--d deviceId: default 0
+-d deviceId: default 0, '-d all' for all available CUDA devices
 -c : search for compressed address
 -u : search for uncompressed address (default)
 -b NbBlocks: default processorCount * 8
diff --git a/WifSolverCuda/WifSolverCuda.vcxproj b/WifSolverCuda/WifSolverCuda.vcxproj
index 44bab91..1b5736d 100644
--- a/WifSolverCuda/WifSolverCuda.vcxproj
+++ b/WifSolverCuda/WifSolverCuda.vcxproj
@@ -30,7 +30,7 @@
-
+
@@ -55,7 +55,7 @@
       64
-      compute_86,sm_86
+      compute_75,sm_75
@@ -75,7 +75,7 @@
       64
-      compute_86,sm_86
+      compute_75,sm_75
@@ -112,6 +112,6 @@
-
+
\ No newline at end of file
diff --git a/WifSolverCuda/Worker.cuh b/WifSolverCuda/Worker.cuh
index 195ba2c..7fd8531 100644
--- a/WifSolverCuda/Worker.cuh
+++ b/WifSolverCuda/Worker.cuh
@@ -1,5 +1,6 @@
 #include
+#include
 #include
 #include "cuda_runtime.h"
 #include "device_launch_parameters.h"
@@ -13,10 +14,10 @@ __global__ void kernelUncompressed(bool* buffResult, bool* buffCollectorWork, ui
 __global__ void kernelCompressed(bool* buffResult, bool* buffCollectorWork, uint64_t* const __restrict__ buffRangeStart, const int threadNumberOfChecks, const uint32_t checksum);
 __global__ void resultCollector(bool* buffResult, uint64_t* buffCombinedResult, const uint64_t threadsInBlockNumberOfChecks);
 
-__global__ void kernelCompressed(uint32_t* unifiedResult, bool* isResultFlag, uint64_t* const __restrict__ buffRangeStart, const int threadNumberOfChecks);
-__global__ void kernelCompressed(uint32_t* unifiedResult, bool* isResultFlag, uint64_t* const __restrict__ buffRangeStart, const int threadNumberOfChecks, const uint32_t checksum);
-__global__ void kernelUncompressed(uint32_t* unifiedResult, bool* isResultFlag, uint64_t* const __restrict__ buffRangeStart, const int threadNumberOfChecks);
-__global__ void kernelUncompressed(uint32_t* unifiedResult, bool* isResultFlag, uint64_t* const __restrict__ buffRangeStart, const int threadNumberOfChecks, const uint32_t checksum);
+__global__ void kernelCompressed(const int gpuIx, uint32_t* unifiedResult, bool* isResultFlag, uint64_t* const __restrict__ buffRangeStart, const int threadNumberOfChecks);
+__global__ void kernelCompressed(const int gpuIx, uint32_t* unifiedResult, bool* isResultFlag, uint64_t* const __restrict__ buffRangeStart, const int threadNumberOfChecks, const uint32_t checksum);
+__global__ void kernelUncompressed(const int gpuIx, uint32_t* unifiedResult, bool* isResultFlag, uint64_t* const __restrict__ buffRangeStart, const int threadNumberOfChecks);
+__global__ void kernelUncompressed(const int gpuIx, uint32_t* unifiedResult, bool* isResultFlag, uint64_t* const __restrict__ buffRangeStart, const int threadNumberOfChecks, const uint32_t checksum);
 
 __device__ bool _checksumDoubleSha256CheckUncompressed(unsigned int checksum, beu32* d_hash, uint64_t* _start);
 __device__ bool _checksumDoubleSha256CheckCompressed(unsigned int checksum, beu32* d_hash, uint64_t* _start);
@@ -29,6 +30,6 @@ __device__ void _load(uint64_t* C, uint64_t* A);
 __device__ void IMult(uint64_t* r, uint64_t* a, int64_t b);
 __device__ void initShared();
-__device__ void summaryShared(uint32_t* unifiedResult, bool* isResultFlag);
+__device__ __inline__ void summaryShared(const int gpuIx, uint32_t* unifiedResult, bool* isResultFlag);
 cudaError_t loadStride(uint64_t* stride);
\ No newline at end of file
diff --git a/WifSolverCuda/Worker1.cu b/WifSolverCuda/Worker1.cu
index 5431213..e76d39f 100644
--- a/WifSolverCuda/Worker1.cu
+++ b/WifSolverCuda/Worker1.cu
@@ -91,7 +91,7 @@ __global__ void resultCollector(bool* buffResult, uint64_t* buffCombinedResult,
         buffCombinedResult[blockIdx.x] = 0xffffffffffff;
 }
 
-__global__ void kernelUncompressed(uint32_t* unifiedResult, bool* isResultFlag, uint64_t* const __restrict__ buffRangeStart, const int threadNumberOfChecks, const uint32_t checksum) {
+__global__ void kernelUncompressed(const int gpuIx, uint32_t* unifiedResult, bool* isResultFlag, uint64_t* const __restrict__ buffRangeStart, const int threadNumberOfChecks, const uint32_t checksum) {
     uint64_t _start[5];
     beu32 d_hash[8];
@@ -112,9 +112,9 @@ __global__ void kernelUncompressed(uint32_t* unifiedResult, bool* isResultFlag,
         }
         _add(_start, _stride);
     }
-    summaryShared(unifiedResult, isResultFlag);
+    summaryShared(gpuIx, unifiedResult, isResultFlag);
 }
-__global__ void kernelUncompressed(uint32_t* unifiedResult, bool* isResultFlag, uint64_t* const __restrict__ buffRangeStart, const int threadNumberOfChecks) {
+__global__ void kernelUncompressed(const int gpuIx, uint32_t* unifiedResult, bool* isResultFlag, uint64_t* const __restrict__ buffRangeStart, const int threadNumberOfChecks) {
     uint64_t _start[5];
     beu32 d_hash[8];
@@ -135,12 +135,11 @@ __global__ void kernelUncompressed(uint32_t* unifiedResult, bool* isResultFlag,
         }
         _add(_start, _stride);
     }
-    summaryShared(unifiedResult, isResultFlag);
+    summaryShared(gpuIx, unifiedResult, isResultFlag);
 }
-__global__ void kernelCompressed(uint32_t* unifiedResult, bool* isResultFlag, uint64_t* const __restrict__ buffRangeStart, const int threadNumberOfChecks) {
+__global__ void kernelCompressed(const int gpuIx, uint32_t* unifiedResult, bool* isResultFlag, uint64_t* const __restrict__ buffRangeStart, const int threadNumberOfChecks) {
     uint64_t _start[5];
     beu32 d_hash[8];
-    int64_t resIx = threadIdx.x;
     int64_t tIx = (threadIdx.x + blockIdx.x * blockDim.x) * threadNumberOfChecks;
     IMult(_start, _stride, tIx);
@@ -162,9 +161,9 @@ __global__ void kernelCompressed(uint32_t* unifiedResult, bool* isResultFlag, ui
         }
         _add(_start, _stride);
     }
-    summaryShared(unifiedResult, isResultFlag);
+    summaryShared(gpuIx, unifiedResult, isResultFlag);
 }
-__global__ void kernelCompressed(uint32_t* unifiedResult, bool* isResultFlag, uint64_t* const __restrict__ buffRangeStart, const int threadNumberOfChecks, const uint32_t checksum) {
+__global__ void kernelCompressed(const int gpuIx, uint32_t* unifiedResult, bool* isResultFlag, uint64_t* const __restrict__ buffRangeStart, const int threadNumberOfChecks, const uint32_t checksum) {
     uint64_t _start[5];
     beu32 d_hash[8];
@@ -189,7 +188,7 @@ __global__ void kernelCompressed(uint32_t* unifiedResult, bool* isResultFlag, ui
         }
         _add(_start, _stride);
     }
-    summaryShared(unifiedResult, isResultFlag);
+    summaryShared(gpuIx, unifiedResult, isResultFlag);
 }
 
 __device__ __inline__ void initShared() {
@@ -202,11 +201,11 @@ __device__ __inline__ void initShared() {
     }
     __syncthreads();
 }
-__device__ __inline__ void summaryShared(uint32_t* unifiedResult, bool* isResultFlag) {
+__device__ __inline__ void summaryShared(const int gpuIx, uint32_t* unifiedResult, bool* isResultFlag) {
     __syncthreads();
-    if (_blockResultFlag[0] && threadIdx.x == 0) {
-        isResultFlag[0] = true;
-        for (int i = 0, rIx = blockIdx.x; i < blockDim.x * 4; i++) {
+    if (threadIdx.x == 0 && _blockResultFlag[0]) {
+        isResultFlag[gpuIx] = true;
+        for (int i = 0, rIx = (blockIdx.x + 4*gpuIx*gridDim.x*blockDim.x); i < blockDim.x * 4; i++) {
             if (_blockResults[i] != UINT32_MAX) {
                 unifiedResult[rIx] = _blockResults[i];
                 rIx += gridDim.x;
diff --git a/WifSolverCuda/main.cu b/WifSolverCuda/main.cu
index 9880980..1aa93eb 100644
--- a/WifSolverCuda/main.cu
+++ b/WifSolverCuda/main.cu
@@ -2,13 +2,13 @@
 #include "cuda_runtime.h"
 #include "device_launch_parameters.h"
 
-#include
 #include
 #include
 #include
 #include
 #include
 #include
+#include
 
 #include "lib/Int.h"
 #include "lib/Math.cuh"
@@ -31,14 +31,18 @@ void decodeWif();
 void printSpeed(double speed);
 void saveStatus();
 void restoreSettings(string fileStatusRestore);
+cudaError_t processingUnit(const uint8_t gpuIx, uint64_t** dev_buffRangeStart, Int buffRangeStart, const uint32_t expectedChecksum, uint32_t* buffResultManaged, bool* buffIsResultManaged);
 
 cudaError_t processCuda();
 cudaError_t processCudaUnified();
-cudaError_t executeKernel(uint32_t* _buffResultManaged, bool* _buffIsResultManaged, uint64_t* const _dev_buffRangeStart, const uint32_t _checksum);
+cudaError_t processCudaUnifiedMulti();
+cudaError_t executeKernel(uint32_t* _buffResultManaged, bool* _buffIsResultManaged, uint64_t* const _dev_buffRangeStart, const uint32_t _checksum, const int gpuIx);
 
 bool unifiedMemory = true;
+const size_t RANGE_TRANSFER_SIZE = NB64BLOCK * sizeof(uint64_t);
 
 int DEVICE_NR = 0;
+int nDevices;
 unsigned int BLOCK_THREADS = 0;
 unsigned int BLOCK_NUMBER = 0;
 unsigned int THREAD_STEPS = 5000;
@@ -79,9 +83,10 @@ bool isVerbose = false;
 
 Secp256K1* secp;
 
+
 int main(int argc, char** argv)
 {
-    printf("WifSolver 0.5.5\n\n");
+    printf("WifSolver 0.6.0\n\n");
     printf("Use parameter '-h' for help and list of available parameters\n\n");
 
     if (argc <=1 || readArgs(argc, argv)) {
@@ -128,7 +133,12 @@ int main(int argc, char** argv)
 
     cudaError_t cudaStatus;
     if (unifiedMemory) {
-        cudaStatus = processCudaUnified();
+        if (DEVICE_NR == -1) {
+            cudaStatus = processCudaUnifiedMulti();
+        }
+        else {
+            cudaStatus = processCudaUnified();
+        }
     }
     else {
         cudaStatus = processCuda();
@@ -148,13 +158,129 @@ int main(int argc, char** argv)
     return 0;
 }
 
+cudaError_t processCudaUnifiedMulti() {
+    cudaError_t cudaStatus;
+
+    uint64_t* buffStride = new uint64_t[NB64BLOCK];
+    __Load(buffStride, STRIDE.bits64);
+    for (int i = 0; i < nDevices; i++) {
+        cudaSetDevice(i);
+        loadStride(buffStride);
+    }
+    delete buffStride;
+
+    const int COLLECTOR_SIZE_MM_PER_GPU = 4 * BLOCK_NUMBER * BLOCK_THREADS;
+    const int COLLECTOR_SIZE_MM = COLLECTOR_SIZE_MM_PER_GPU * nDevices;
+    const uint32_t expectedChecksum = IS_CHECKSUM ? CHECKSUM.GetInt32() : 0;
+    uint64_t counter = 0;
+
+    uint32_t* buffResultManaged = new uint32_t[COLLECTOR_SIZE_MM];
+    cudaStatus = cudaMallocManaged(&buffResultManaged, COLLECTOR_SIZE_MM * sizeof(uint32_t));
+    if (cudaStatus != cudaSuccess) {
+        fprintf(stderr, "cudaMallocManaged failed 'buffResultManaged': %s\n", cudaGetErrorString(cudaStatus));
+    }
+    for (int i = 0; i < COLLECTOR_SIZE_MM; i++) {
+        buffResultManaged[i] = UINT32_MAX;
+    }
+    bool* buffIsResultManaged = new bool[nDevices];
+    cudaStatus = cudaMallocManaged(&buffIsResultManaged, nDevices * sizeof(bool));
+    for (int gpuIx = 0; gpuIx < nDevices; gpuIx++) {
+        buffIsResultManaged[gpuIx] = false;
+    }
+
+    uint64_t* buffRangeStart = new uint64_t[NB64BLOCK];
+    uint64_t** dev_buffRangeStart = (uint64_t**)malloc(sizeof(uint64_t*) * nDevices);
+    for (int gpuIx = 0; gpuIx < nDevices; gpuIx++) {
+        cudaSetDevice(gpuIx);
+        cudaStatus = cudaMalloc((void**)&dev_buffRangeStart[gpuIx], NB64BLOCK * sizeof(uint64_t));
+        if (cudaStatus != cudaSuccess) {
+            fprintf(stderr, "%d:: cudaMallocManaged failed 'buffResultManaged': %s\n", gpuIx, cudaGetErrorString(cudaStatus));
+        }
+    }
+
+    std::thread* threads = new std::thread[nDevices];
+
+    std::chrono::steady_clock::time_point beginCountHashrate = std::chrono::steady_clock::now();
+    std::chrono::steady_clock::time_point beginCountStatus = std::chrono::steady_clock::now();
+
+    while (!RESULT && RANGE_START.IsLower(&RANGE_END)) {
+        Int rangeTestStart = new Int(&RANGE_START);
+        //__Load(buffRangeStart, RANGE_START.bits64);
+        for (int gpuIx = 0; gpuIx < nDevices; gpuIx++) {
+            threads[gpuIx] = thread(processingUnit, gpuIx, dev_buffRangeStart, rangeTestStart, expectedChecksum, buffResultManaged, buffIsResultManaged);
+        }
+
+        long long tHash = std::chrono::duration_cast(std::chrono::steady_clock::now() - beginCountHashrate).count();
+        if (tHash >= 5000) {
+            printSpeed((double)((double)counter / tHash) / 1000.0);
+            counter = 0;
+            beginCountHashrate = std::chrono::steady_clock::now();
+            if (std::chrono::duration_cast(std::chrono::steady_clock::now() - beginCountStatus).count() >= fileStatusInterval) {
+                saveStatus();
+                beginCountStatus = std::chrono::steady_clock::now();
+            }
+        }
+        counter += outputSize * nDevices;
+        for (uint8_t gpuIx = 0; gpuIx < nDevices; gpuIx++) {
+            threads[gpuIx].join();
+        }
+
+        for (uint8_t gpuIx = 0; gpuIx < nDevices; gpuIx++) {
+            //test result, to be moved to separate thread
+            if (buffIsResultManaged[gpuIx]) {
+                buffIsResultManaged[gpuIx] = false;
+                for (int i = COLLECTOR_SIZE_MM_PER_GPU * gpuIx, ix = 0; ix < COLLECTOR_SIZE_MM_PER_GPU && !RESULT; i++, ix++) {
+                    if (buffResultManaged[i] != UINT32_MAX) {
+                        Int toTest = new Int(&rangeTestStart);
+                        Int diff = new Int(&STRIDE);
+                        diff.Mult(buffResultManaged[i]);
+                        toTest.Add(&diff);
+                        processCandidate(toTest);
+                        buffResultManaged[i] = UINT32_MAX;
+                    }
+                }
+            }//test
+            rangeTestStart.Add(&loopStride);
+            RANGE_START.Add(&loopStride);
+        }
+    }
+    return cudaStatus;
+}
+
+cudaError_t processingUnit(const uint8_t gpuIx, uint64_t** dev_buffRangeStart, Int rangeStart, const uint32_t expectedChecksum, uint32_t* buffResultManaged, bool* buffIsResultManaged) {
+    cudaSetDevice(gpuIx);
+    Int tempStart = new Int(&rangeStart);
+    if (gpuIx > 0) {
+        Int m = new Int((uint64_t)gpuIx);
+        m.Mult(&loopStride);
+        tempStart.Add(&m);
+    }
+    uint64_t* tmpBufferStart = new uint64_t[NBBLOCK];
+    __Load(tmpBufferStart, tempStart.bits64);
+    cudaError_t cudaStatus = cudaMemcpy(dev_buffRangeStart[gpuIx], tmpBufferStart, RANGE_TRANSFER_SIZE, cudaMemcpyHostToDevice);
+    if (cudaStatus != cudaSuccess) {
+        fprintf(stderr, "%d:: cudaMemcpy failed 'dev_buffRangeStart': %s\n", gpuIx, cudaGetErrorString(cudaStatus));
+        return cudaStatus;
+    }
+    cudaStatus = executeKernel(buffResultManaged, buffIsResultManaged, dev_buffRangeStart[gpuIx], expectedChecksum, gpuIx);
+    if (cudaStatus != cudaSuccess) {
+        fprintf(stderr, "%d:: kernel launch failed: %s\n", gpuIx, cudaGetErrorString(cudaStatus));
+        return cudaStatus;
+    }
+    cudaStatus = cudaDeviceSynchronize();
+    if (cudaStatus != cudaSuccess) {
+        fprintf(stderr, "%d:: cudaDeviceSynchronize returned error code %d after launching kernel!\n", gpuIx, cudaStatus);
+        return cudaStatus;
+    }
+    return cudaStatus;
+}
+
 cudaError_t processCudaUnified() {
     cudaError_t cudaStatus;
     uint64_t* buffRangeStart = new uint64_t[NB64BLOCK];
     uint64_t* dev_buffRangeStart = new uint64_t[NB64BLOCK];
     uint64_t* buffStride = new uint64_t[NB64BLOCK];
-    const size_t RANGE_TRANSFER_SIZE = NB64BLOCK * sizeof(uint64_t);
     const int COLLECTOR_SIZE_MM = 4 * BLOCK_NUMBER * BLOCK_THREADS;
     const uint32_t expectedChecksum = IS_CHECKSUM ? CHECKSUM.GetInt32() : 0;
     uint64_t counter = 0;
@@ -183,7 +309,7 @@ cudaError_t processCudaUnified() {
 
     while (!RESULT && RANGE_START.IsLower(&RANGE_END)) {
         //launch work
-        cudaStatus = executeKernel(buffResultManaged, buffIsResultManaged, dev_buffRangeStart, expectedChecksum);
+        cudaStatus = executeKernel(buffResultManaged, buffIsResultManaged, dev_buffRangeStart, expectedChecksum, 0);
         if (cudaStatus != cudaSuccess) {
             fprintf(stderr, "kernel launch failed: %s\n", cudaGetErrorString(cudaStatus));
             goto Error;
@@ -240,21 +366,21 @@ Error:
     return cudaStatus;
 }
 
-cudaError_t executeKernel(uint32_t* _buffResultManaged, bool* _buffIsResultManaged, uint64_t* const _dev_buffRangeStart, const uint32_t _checksum) {
+cudaError_t executeKernel(uint32_t* _buffResultManaged, bool* _buffIsResultManaged, uint64_t* const _dev_buffRangeStart, const uint32_t _checksum, const int gpuIx) {
     if (COMPRESSED) {
         if (IS_CHECKSUM) {
-            kernelCompressed << <BLOCK_NUMBER, BLOCK_THREADS >> > (_buffResultManaged, _buffIsResultManaged, _dev_buffRangeStart, THREAD_STEPS, _checksum);
+            kernelCompressed << <BLOCK_NUMBER, BLOCK_THREADS >> > (gpuIx, _buffResultManaged, _buffIsResultManaged, _dev_buffRangeStart, THREAD_STEPS, _checksum);
         }
         else {
-            kernelCompressed << <BLOCK_NUMBER, BLOCK_THREADS >> > (_buffResultManaged, _buffIsResultManaged, _dev_buffRangeStart, THREAD_STEPS);
+            kernelCompressed << <BLOCK_NUMBER, BLOCK_THREADS >> > (gpuIx, _buffResultManaged, _buffIsResultManaged, _dev_buffRangeStart, THREAD_STEPS);
         }
     }
     else {
         if (IS_CHECKSUM) {
-            kernelUncompressed << <BLOCK_NUMBER, BLOCK_THREADS >> > (_buffResultManaged, _buffIsResultManaged, _dev_buffRangeStart, THREAD_STEPS, _checksum);
+            kernelUncompressed << <BLOCK_NUMBER, BLOCK_THREADS >> > (gpuIx, _buffResultManaged, _buffIsResultManaged, _dev_buffRangeStart, THREAD_STEPS, _checksum);
         }
         else {
-            kernelUncompressed << <BLOCK_NUMBER, BLOCK_THREADS >> > (_buffResultManaged, _buffIsResultManaged, _dev_buffRangeStart, THREAD_STEPS);
+            kernelUncompressed << <BLOCK_NUMBER, BLOCK_THREADS >> > (gpuIx, _buffResultManaged, _buffIsResultManaged, _dev_buffRangeStart, THREAD_STEPS);
         }
     }
     return cudaGetLastError();
@@ -664,32 +790,59 @@ void printFooter() {
 }
 
 bool checkDevice() {
-    cudaError_t cudaStatus = cudaSetDevice(DEVICE_NR);
-    if (cudaStatus != cudaSuccess) {
-        fprintf(stderr, "device %d failed!", DEVICE_NR);
-        return false;
-    }
-    else {
-        cudaDeviceProp props;
-        cudaStatus = cudaGetDeviceProperties(&props, DEVICE_NR);
-        printf("Using GPU nr %d:\n", DEVICE_NR);
-        if (props.canMapHostMemory == 0) {
-            printf("unified memory not supported\n");
-            unifiedMemory = 0;
-        }
-        printf("%s (%2d procs)\n", props.name, props.multiProcessorCount);
-        printf("maxThreadsPerBlock: %2d\n\n", props.maxThreadsPerBlock);
-        if (BLOCK_NUMBER == 0) {
-            BLOCK_NUMBER = props.multiProcessorCount * 8;
-        }
-        if (BLOCK_THREADS == 0) {
-            BLOCK_THREADS = (props.maxThreadsPerBlock / 8) * 5;
+    if (DEVICE_NR == -1) {
+        cudaGetDeviceCount(&nDevices);
+        //nDevices = 1;
+        for (int i = 0; i < nDevices; i++) {
+            cudaError_t cudaStatus = cudaSetDevice(i);
+            if (cudaStatus != cudaSuccess) {
+                fprintf(stderr, "device %d failed!", i);
+                return false;
+            }
+            cudaDeviceProp props;
+            cudaGetDeviceProperties(&props, i);
+            printf("Device Number: %d\n", i);
+            printf(" %s\n", props.name);
+            if (BLOCK_NUMBER == 0) {
+                BLOCK_NUMBER = props.multiProcessorCount * 4;
+            }
+            if (BLOCK_THREADS == 0) {
+                BLOCK_THREADS = props.maxThreadsPerBlock / 8 * 3;
+            }
+        }
         outputSize = BLOCK_NUMBER * BLOCK_THREADS * THREAD_STEPS;
         loopStride = new Int(&STRIDE);
         loopStride.Mult(outputSize);
+        return true;
+    }
+    else {
+        cudaError_t cudaStatus = cudaSetDevice(DEVICE_NR);
+        if (cudaStatus != cudaSuccess) {
+            fprintf(stderr, "device %d failed!", DEVICE_NR);
+            return false;
+        }
+        else {
+            cudaDeviceProp props;
+            cudaStatus = cudaGetDeviceProperties(&props, DEVICE_NR);
+            printf("Using GPU nr %d:\n", DEVICE_NR);
+            if (props.canMapHostMemory == 0) {
+                printf("unified memory not supported\n");
+                unifiedMemory = 0;
+            }
+            printf("%s (%2d procs)\n", props.name, props.multiProcessorCount);
+            printf("maxThreadsPerBlock: %2d\n\n", props.maxThreadsPerBlock);
+            if (BLOCK_NUMBER == 0) {
+                BLOCK_NUMBER = props.multiProcessorCount * 8;
+            }
+            if (BLOCK_THREADS == 0) {
+                BLOCK_THREADS = (props.maxThreadsPerBlock / 8) * 5;
+            }
+            outputSize = BLOCK_NUMBER * BLOCK_THREADS * THREAD_STEPS;
+            loopStride = new Int(&STRIDE);
+            loopStride.Mult(outputSize);
+        }
+        return true;
     }
-    return true;
 }
 
 void showHelp() {
@@ -713,7 +866,7 @@
     printf("-fresultp reportFile: file for each WIF with correct checksum (default: %s)\n", fileResultPartial.c_str());
     printf("-fstatus statusFile: file for periodically saved status (default: %s) \n", fileStatus.c_str());
     printf("-fstatusIntv seconds: period between status file updates (default %d sec) \n", fileStatusInterval);
-    printf("-d deviceId: default 0\n");
+    printf("-d deviceId: default 0, '-d all' for all available CUDA devices\n");
     printf("-c : search for compressed address\n");
     printf("-u : search for uncompressed address (default)\n");
     printf("-b NbBlocks: default processorCount * 8\n");
@@ -754,7 +907,12 @@ bool readArgs(int argc, char** argv) {
         }
         else if (strcmp(argv[a], "-d") == 0) {
             a++;
-            DEVICE_NR = strtol(argv[a], NULL, 10);
+            if ("all" == string(argv[a])) {
+                DEVICE_NR = -1;
+            }
+            else {
+                DEVICE_NR = strtol(argv[a], NULL, 10);
+            }
         }
         else if (strcmp(argv[a], "-c") == 0) {
            COMPRESSED = true;
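
Note (not part of the patch): the '-d all' path added above follows the common one-host-thread-per-device CUDA pattern - each std::thread calls cudaSetDevice() for its own gpuIx before copying its range start and launching kernels, and the host joins all threads before scanning the shared result buffers. The sketch below only illustrates that pattern in isolation; launchOnDevice() is a hypothetical stand-in for processingUnit() and is not code from this repository.

#include <cstdio>
#include <thread>
#include <vector>
#include "cuda_runtime.h"

// Hypothetical per-device worker: bind the device, then do that GPU's share of the work.
void launchOnDevice(int gpuIx) {
    // Each host thread must select its own device before issuing any CUDA calls.
    cudaError_t status = cudaSetDevice(gpuIx);
    if (status != cudaSuccess) {
        fprintf(stderr, "%d:: cudaSetDevice failed: %s\n", gpuIx, cudaGetErrorString(status));
        return;
    }
    // ... copy this device's range start, launch the kernel,
    // then cudaDeviceSynchronize() before the host reads its results ...
}

int main() {
    int nDevices = 0;
    cudaGetDeviceCount(&nDevices);      // same call checkDevice() uses when '-d all' is given
    std::vector<std::thread> workers;
    for (int gpuIx = 0; gpuIx < nDevices; gpuIx++) {
        workers.emplace_back(launchOnDevice, gpuIx);
    }
    for (auto& w : workers) {
        w.join();                       // wait for every device before consuming results
    }
    return 0;
}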