diff --git a/README.md b/README.md
index 6fc5199..ecb0c9a 100644
--- a/README.md
+++ b/README.md
@@ -25,7 +25,7 @@ Usage:
-fresultp reportFile: file for each WIF with correct checksum (default: result_partial.txt)
-fstatus statusFile: file for periodically saved status (default: fileStatus.txt)
-fstatusIntv seconds: period between status file updates (default 60 sec)
- -d deviceId: default 0
+ -d deviceId: default 0, '-d all' for all available CUDA devices
-c : search for compressed address
-u : search for uncompressed address (default)
-b NbBlocks: default processorCount * 8
diff --git a/WifSolverCuda/WifSolverCuda.vcxproj b/WifSolverCuda/WifSolverCuda.vcxproj
index 44bab91..1b5736d 100644
--- a/WifSolverCuda/WifSolverCuda.vcxproj
+++ b/WifSolverCuda/WifSolverCuda.vcxproj
@@ -30,7 +30,7 @@
-
+
@@ -55,7 +55,7 @@
64
- compute_86,sm_86
+ compute_75,sm_75
@@ -75,7 +75,7 @@
64
- compute_86,sm_86
+ compute_75,sm_75
@@ -112,6 +112,6 @@
-
+
\ No newline at end of file
diff --git a/WifSolverCuda/Worker.cuh b/WifSolverCuda/Worker.cuh
index 195ba2c..7fd8531 100644
--- a/WifSolverCuda/Worker.cuh
+++ b/WifSolverCuda/Worker.cuh
@@ -1,5 +1,6 @@
#include
+#include
#include
#include "cuda_runtime.h"
#include "device_launch_parameters.h"
@@ -13,10 +14,10 @@ __global__ void kernelUncompressed(bool* buffResult, bool* buffCollectorWork, ui
__global__ void kernelCompressed(bool* buffResult, bool* buffCollectorWork, uint64_t* const __restrict__ buffRangeStart, const int threadNumberOfChecks, const uint32_t checksum);
__global__ void resultCollector(bool* buffResult, uint64_t* buffCombinedResult, const uint64_t threadsInBlockNumberOfChecks);
-__global__ void kernelCompressed(uint32_t* unifiedResult, bool* isResultFlag, uint64_t* const __restrict__ buffRangeStart, const int threadNumberOfChecks);
-__global__ void kernelCompressed(uint32_t* unifiedResult, bool* isResultFlag, uint64_t* const __restrict__ buffRangeStart, const int threadNumberOfChecks, const uint32_t checksum);
-__global__ void kernelUncompressed(uint32_t* unifiedResult, bool* isResultFlag, uint64_t* const __restrict__ buffRangeStart, const int threadNumberOfChecks);
-__global__ void kernelUncompressed(uint32_t* unifiedResult, bool* isResultFlag, uint64_t* const __restrict__ buffRangeStart, const int threadNumberOfChecks, const uint32_t checksum);
+__global__ void kernelCompressed(const int gpuIx, uint32_t* unifiedResult, bool* isResultFlag, uint64_t* const __restrict__ buffRangeStart, const int threadNumberOfChecks);
+__global__ void kernelCompressed(const int gpuIx, uint32_t* unifiedResult, bool* isResultFlag, uint64_t* const __restrict__ buffRangeStart, const int threadNumberOfChecks, const uint32_t checksum);
+__global__ void kernelUncompressed(const int gpuIx, uint32_t* unifiedResult, bool* isResultFlag, uint64_t* const __restrict__ buffRangeStart, const int threadNumberOfChecks);
+__global__ void kernelUncompressed(const int gpuIx, uint32_t* unifiedResult, bool* isResultFlag, uint64_t* const __restrict__ buffRangeStart, const int threadNumberOfChecks, const uint32_t checksum);
__device__ bool _checksumDoubleSha256CheckUncompressed(unsigned int checksum, beu32* d_hash, uint64_t* _start);
__device__ bool _checksumDoubleSha256CheckCompressed(unsigned int checksum, beu32* d_hash, uint64_t* _start);
@@ -29,6 +30,6 @@ __device__ void _load(uint64_t* C, uint64_t* A);
__device__ void IMult(uint64_t* r, uint64_t* a, int64_t b);
__device__ void initShared();
-__device__ void summaryShared(uint32_t* unifiedResult, bool* isResultFlag);
+__device__ __inline__ void summaryShared(const int gpuIx, uint32_t* unifiedResult, bool* isResultFlag);
cudaError_t loadStride(uint64_t* stride);
\ No newline at end of file
diff --git a/WifSolverCuda/Worker1.cu b/WifSolverCuda/Worker1.cu
index 5431213..e76d39f 100644
--- a/WifSolverCuda/Worker1.cu
+++ b/WifSolverCuda/Worker1.cu
@@ -91,7 +91,7 @@ __global__ void resultCollector(bool* buffResult, uint64_t* buffCombinedResult,
buffCombinedResult[blockIdx.x] = 0xffffffffffff;
}
-__global__ void kernelUncompressed(uint32_t* unifiedResult, bool* isResultFlag, uint64_t* const __restrict__ buffRangeStart, const int threadNumberOfChecks, const uint32_t checksum) {
+__global__ void kernelUncompressed(const int gpuIx, uint32_t* unifiedResult, bool* isResultFlag, uint64_t* const __restrict__ buffRangeStart, const int threadNumberOfChecks, const uint32_t checksum) {
uint64_t _start[5];
beu32 d_hash[8];
@@ -112,9 +112,9 @@ __global__ void kernelUncompressed(uint32_t* unifiedResult, bool* isResultFlag,
}
_add(_start, _stride);
}
- summaryShared(unifiedResult, isResultFlag);
+ summaryShared(gpuIx, unifiedResult, isResultFlag);
}
-__global__ void kernelUncompressed(uint32_t* unifiedResult, bool* isResultFlag, uint64_t* const __restrict__ buffRangeStart, const int threadNumberOfChecks) {
+__global__ void kernelUncompressed(const int gpuIx, uint32_t* unifiedResult, bool* isResultFlag, uint64_t* const __restrict__ buffRangeStart, const int threadNumberOfChecks) {
uint64_t _start[5];
beu32 d_hash[8];
@@ -135,12 +135,11 @@ __global__ void kernelUncompressed(uint32_t* unifiedResult, bool* isResultFlag,
}
_add(_start, _stride);
}
- summaryShared(unifiedResult, isResultFlag);
+ summaryShared(gpuIx, unifiedResult, isResultFlag);
}
-__global__ void kernelCompressed(uint32_t* unifiedResult, bool* isResultFlag, uint64_t* const __restrict__ buffRangeStart, const int threadNumberOfChecks) {
+__global__ void kernelCompressed(const int gpuIx, uint32_t* unifiedResult, bool* isResultFlag, uint64_t* const __restrict__ buffRangeStart, const int threadNumberOfChecks) {
uint64_t _start[5];
beu32 d_hash[8];
-
int64_t resIx = threadIdx.x;
int64_t tIx = (threadIdx.x + blockIdx.x * blockDim.x) * threadNumberOfChecks;
IMult(_start, _stride, tIx);
@@ -162,9 +161,9 @@ __global__ void kernelCompressed(uint32_t* unifiedResult, bool* isResultFlag, ui
}
_add(_start, _stride);
}
- summaryShared(unifiedResult, isResultFlag);
+ summaryShared(gpuIx, unifiedResult, isResultFlag);
}
-__global__ void kernelCompressed(uint32_t* unifiedResult, bool* isResultFlag, uint64_t* const __restrict__ buffRangeStart, const int threadNumberOfChecks, const uint32_t checksum) {
+__global__ void kernelCompressed(const int gpuIx, uint32_t* unifiedResult, bool* isResultFlag, uint64_t* const __restrict__ buffRangeStart, const int threadNumberOfChecks, const uint32_t checksum) {
uint64_t _start[5];
beu32 d_hash[8];
@@ -189,7 +188,7 @@ __global__ void kernelCompressed(uint32_t* unifiedResult, bool* isResultFlag, ui
}
_add(_start, _stride);
}
- summaryShared(unifiedResult, isResultFlag);
+ summaryShared(gpuIx, unifiedResult, isResultFlag);
}
__device__ __inline__ void initShared() {
@@ -202,11 +201,11 @@ __device__ __inline__ void initShared() {
}
__syncthreads();
}
-__device__ __inline__ void summaryShared(uint32_t* unifiedResult, bool* isResultFlag) {
+__device__ __inline__ void summaryShared(const int gpuIx, uint32_t* unifiedResult, bool* isResultFlag) {
__syncthreads();
- if (_blockResultFlag[0] && threadIdx.x == 0) {
- isResultFlag[0] = true;
- for (int i = 0, rIx = blockIdx.x; i < blockDim.x * 4; i++) {
+ if (threadIdx.x == 0 && _blockResultFlag[0]) {
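+ // thread 0 publishes this block's hits into this GPU's segment of the unified collector (4 * blockDim.x * gridDim.x slots per device)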
+ isResultFlag[gpuIx] = true;
+ for (int i = 0, rIx = (blockIdx.x + 4*gpuIx*gridDim.x*blockDim.x); i < blockDim.x * 4; i++) {
if (_blockResults[i] != UINT32_MAX) {
unifiedResult[rIx] = _blockResults[i];
rIx += gridDim.x;
diff --git a/WifSolverCuda/main.cu b/WifSolverCuda/main.cu
index 9880980..1aa93eb 100644
--- a/WifSolverCuda/main.cu
+++ b/WifSolverCuda/main.cu
@@ -2,13 +2,13 @@
#include "cuda_runtime.h"
#include "device_launch_parameters.h"
-#include
#include
#include
#include
#include
#include
#include
+#include <thread>
#include "lib/Int.h"
#include "lib/Math.cuh"
@@ -31,14 +31,18 @@ void decodeWif();
void printSpeed(double speed);
void saveStatus();
void restoreSettings(string fileStatusRestore);
+cudaError_t processingUnit(const uint8_t gpuIx, uint64_t** dev_buffRangeStart, Int rangeStart, const uint32_t expectedChecksum, uint32_t* buffResultManaged, bool* buffIsResultManaged);
cudaError_t processCuda();
cudaError_t processCudaUnified();
-cudaError_t executeKernel(uint32_t* _buffResultManaged, bool* _buffIsResultManaged, uint64_t* const _dev_buffRangeStart, const uint32_t _checksum);
+cudaError_t processCudaUnifiedMulti();
+cudaError_t executeKernel(uint32_t* _buffResultManaged, bool* _buffIsResultManaged, uint64_t* const _dev_buffRangeStart, const uint32_t _checksum, const int gpuIx);
bool unifiedMemory = true;
+const size_t RANGE_TRANSFER_SIZE = NB64BLOCK * sizeof(uint64_t);
int DEVICE_NR = 0;
+int nDevices;
unsigned int BLOCK_THREADS = 0;
unsigned int BLOCK_NUMBER = 0;
unsigned int THREAD_STEPS = 5000;
@@ -79,9 +83,10 @@ bool isVerbose = false;
Secp256K1* secp;
+
int main(int argc, char** argv)
{
- printf("WifSolver 0.5.5\n\n");
+ printf("WifSolver 0.6.0\n\n");
printf("Use parameter '-h' for help and list of available parameters\n\n");
if (argc <=1 || readArgs(argc, argv)) {
@@ -128,7 +133,12 @@ int main(int argc, char** argv)
cudaError_t cudaStatus;
if (unifiedMemory) {
- cudaStatus = processCudaUnified();
+ if (DEVICE_NR == -1) {
+ cudaStatus = processCudaUnifiedMulti();
+ }
+ else {
+ cudaStatus = processCudaUnified();
+ }
}
else {
cudaStatus = processCuda();
@@ -148,13 +158,129 @@ int main(int argc, char** argv)
return 0;
}
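+// Multi-GPU variant of processCudaUnified(): each pass hands every detected device its own
+// loopStride-sized chunk of the range and drives it from a dedicated host thread.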
+cudaError_t processCudaUnifiedMulti() {
+ cudaError_t cudaStatus;
+
+ uint64_t* buffStride = new uint64_t[NB64BLOCK];
+ __Load(buffStride, STRIDE.bits64);
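+ // load the stride onto every device before any kernel is launched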
+ for (int i = 0; i < nDevices; i++) {
+ cudaSetDevice(i);
+ loadStride(buffStride);
+ }
+ delete[] buffStride;
+
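+ // single managed collector shared by all GPUs; device gpuIx writes only into its own COLLECTOR_SIZE_MM_PER_GPU segment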
+ const int COLLECTOR_SIZE_MM_PER_GPU = 4 * BLOCK_NUMBER * BLOCK_THREADS;
+ const int COLLECTOR_SIZE_MM = COLLECTOR_SIZE_MM_PER_GPU * nDevices;
+ const uint32_t expectedChecksum = IS_CHECKSUM ? CHECKSUM.GetInt32() : 0;
+ uint64_t counter = 0;
+
+ uint32_t* buffResultManaged;
+ cudaStatus = cudaMallocManaged(&buffResultManaged, COLLECTOR_SIZE_MM * sizeof(uint32_t));
+ if (cudaStatus != cudaSuccess) {
+ fprintf(stderr, "cudaMallocManaged failed 'buffResultManaged': %s\n", cudaGetErrorString(cudaStatus));
+ }
+ for (int i = 0; i < COLLECTOR_SIZE_MM; i++) {
+ buffResultManaged[i] = UINT32_MAX;
+ }
+ bool* buffIsResultManaged;
+ cudaStatus = cudaMallocManaged(&buffIsResultManaged, nDevices * sizeof(bool));
+ for (int gpuIx = 0; gpuIx < nDevices; gpuIx++) {
+ buffIsResultManaged[gpuIx] = false;
+ }
+
+ uint64_t* buffRangeStart = new uint64_t[NB64BLOCK];
+ uint64_t** dev_buffRangeStart = (uint64_t**)malloc(sizeof(uint64_t*) * nDevices);
+ for (int gpuIx = 0; gpuIx < nDevices; gpuIx++) {
+ cudaSetDevice(gpuIx);
+ cudaStatus = cudaMalloc((void**)&dev_buffRangeStart[gpuIx], NB64BLOCK * sizeof(uint64_t));
+ if (cudaStatus != cudaSuccess) {
+ fprintf(stderr, "%d:: cudaMallocManaged failed 'buffResultManaged': %s\n", gpuIx, cudaGetErrorString(cudaStatus));
+ }
+ }
+
+ std::thread* threads = new std::thread[nDevices];
+
+ std::chrono::steady_clock::time_point beginCountHashrate = std::chrono::steady_clock::now();
+ std::chrono::steady_clock::time_point beginCountStatus = std::chrono::steady_clock::now();
+
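+ // each pass: device gpuIx scans the chunk starting at rangeTestStart + gpuIx * loopStride (see processingUnit)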
+ while (!RESULT && RANGE_START.IsLower(&RANGE_END)) {
+ Int rangeTestStart = new Int(&RANGE_START);
+ //__Load(buffRangeStart, RANGE_START.bits64);
+ for (int gpuIx = 0; gpuIx < nDevices; gpuIx++) {
+ threads[gpuIx] = thread(processingUnit, gpuIx, dev_buffRangeStart, rangeTestStart, expectedChecksum, buffResultManaged, buffIsResultManaged);
+ }
+
+ long long tHash = std::chrono::duration_cast<std::chrono::milliseconds>(std::chrono::steady_clock::now() - beginCountHashrate).count();
+ if (tHash >= 5000) {
+ printSpeed((double)((double)counter / tHash) / 1000.0);
+ counter = 0;
+ beginCountHashrate = std::chrono::steady_clock::now();
+ if (std::chrono::duration_cast<std::chrono::seconds>(std::chrono::steady_clock::now() - beginCountStatus).count() >= fileStatusInterval) {
+ saveStatus();
+ beginCountStatus = std::chrono::steady_clock::now();
+ }
+ }
+ counter += outputSize * nDevices;
+ for (uint8_t gpuIx = 0; gpuIx < nDevices; gpuIx++) {
+ threads[gpuIx].join();
+ }
+
+ for (uint8_t gpuIx = 0; gpuIx < nDevices; gpuIx++) {
+ //test result, to be moved to separate thread
+ if (buffIsResultManaged[gpuIx]) {
+ buffIsResultManaged[gpuIx] = false;
+ for (int i = COLLECTOR_SIZE_MM_PER_GPU * gpuIx, ix = 0; ix < COLLECTOR_SIZE_MM_PER_GPU && !RESULT; i++, ix++) {
+ if (buffResultManaged[i] != UINT32_MAX) {
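+ // reconstruct the candidate: chunk base + STRIDE * offset reported by the kernel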
+ Int toTest = new Int(&rangeTestStart);
+ Int diff = new Int(&STRIDE);
+ diff.Mult(buffResultManaged[i]);
+ toTest.Add(&diff);
+ processCandidate(toTest);
+ buffResultManaged[i] = UINT32_MAX;
+ }
+ }
+ }//test
+ rangeTestStart.Add(&loopStride);
+ RANGE_START.Add(&loopStride);
+ }
+ }
+ return cudaStatus;
+}
+
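+// Per-device worker: binds the calling host thread to device gpuIx, offsets the chunk base by
+// gpuIx * loopStride, copies it to that device and runs one kernel pass.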
+cudaError_t processingUnit(const uint8_t gpuIx, uint64_t** dev_buffRangeStart, Int rangeStart, const uint32_t expectedChecksum, uint32_t* buffResultManaged, bool* buffIsResultManaged) {
+ cudaSetDevice(gpuIx);
+ Int tempStart = new Int(&rangeStart);
+ if (gpuIx > 0) {
+ Int m = new Int((uint64_t)gpuIx);
+ m.Mult(&loopStride);
+ tempStart.Add(&m);
+ }
+ uint64_t* tmpBufferStart = new uint64_t[NB64BLOCK];
+ __Load(tmpBufferStart, tempStart.bits64);
+ cudaError_t cudaStatus = cudaMemcpy(dev_buffRangeStart[gpuIx], tmpBufferStart, RANGE_TRANSFER_SIZE, cudaMemcpyHostToDevice);
+ delete[] tmpBufferStart;
+ if (cudaStatus != cudaSuccess) {
+ fprintf(stderr, "%d:: cudaMemcpy failed 'dev_buffRangeStart': %s\n", gpuIx, cudaGetErrorString(cudaStatus));
+ return cudaStatus;
+ }
+ cudaStatus = executeKernel(buffResultManaged, buffIsResultManaged, dev_buffRangeStart[gpuIx], expectedChecksum, gpuIx);
+ if (cudaStatus != cudaSuccess) {
+ fprintf(stderr, "%d:: kernel launch failed: %s\n", gpuIx, cudaGetErrorString(cudaStatus));
+ return cudaStatus;
+ }
+ cudaStatus = cudaDeviceSynchronize();
+ if (cudaStatus != cudaSuccess) {
+ fprintf(stderr, "%d:: cudaDeviceSynchronize returned error code %d after launching kernel!\n", gpuIx, cudaStatus);
+ return cudaStatus;
+ }
+ return cudaStatus;
+}
+
cudaError_t processCudaUnified() {
cudaError_t cudaStatus;
uint64_t* buffRangeStart = new uint64_t[NB64BLOCK];
uint64_t* dev_buffRangeStart = new uint64_t[NB64BLOCK];
uint64_t* buffStride = new uint64_t[NB64BLOCK];
- const size_t RANGE_TRANSFER_SIZE = NB64BLOCK * sizeof(uint64_t);
const int COLLECTOR_SIZE_MM = 4 * BLOCK_NUMBER * BLOCK_THREADS;
const uint32_t expectedChecksum = IS_CHECKSUM ? CHECKSUM.GetInt32() : 0;
uint64_t counter = 0;
@@ -183,7 +309,7 @@ cudaError_t processCudaUnified() {
while (!RESULT && RANGE_START.IsLower(&RANGE_END)) {
//launch work
- cudaStatus = executeKernel(buffResultManaged, buffIsResultManaged, dev_buffRangeStart, expectedChecksum);
+ cudaStatus = executeKernel(buffResultManaged, buffIsResultManaged, dev_buffRangeStart, expectedChecksum, 0);
if (cudaStatus != cudaSuccess) {
fprintf(stderr, "kernel launch failed: %s\n", cudaGetErrorString(cudaStatus));
goto Error;
@@ -240,21 +366,21 @@ Error:
return cudaStatus;
}
-cudaError_t executeKernel(uint32_t* _buffResultManaged, bool* _buffIsResultManaged, uint64_t* const _dev_buffRangeStart, const uint32_t _checksum) {
+cudaError_t executeKernel(uint32_t* _buffResultManaged, bool* _buffIsResultManaged, uint64_t* const _dev_buffRangeStart, const uint32_t _checksum, const int gpuIx) {
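+ // gpuIx is forwarded to the kernels so results land in that device's collector segment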
if (COMPRESSED) {
if (IS_CHECKSUM) {
- kernelCompressed <<<BLOCK_NUMBER, BLOCK_THREADS>>> (_buffResultManaged, _buffIsResultManaged, _dev_buffRangeStart, THREAD_STEPS, _checksum);
+ kernelCompressed <<<BLOCK_NUMBER, BLOCK_THREADS>>> (gpuIx, _buffResultManaged, _buffIsResultManaged, _dev_buffRangeStart, THREAD_STEPS, _checksum);
}
else {
- kernelCompressed <<<BLOCK_NUMBER, BLOCK_THREADS>>> (_buffResultManaged, _buffIsResultManaged, _dev_buffRangeStart, THREAD_STEPS);
+ kernelCompressed <<<BLOCK_NUMBER, BLOCK_THREADS>>> (gpuIx, _buffResultManaged, _buffIsResultManaged, _dev_buffRangeStart, THREAD_STEPS);
}
}
else {
if (IS_CHECKSUM) {
- kernelUncompressed <<<BLOCK_NUMBER, BLOCK_THREADS>>> (_buffResultManaged, _buffIsResultManaged, _dev_buffRangeStart, THREAD_STEPS, _checksum);
+ kernelUncompressed <<<BLOCK_NUMBER, BLOCK_THREADS>>> (gpuIx, _buffResultManaged, _buffIsResultManaged, _dev_buffRangeStart, THREAD_STEPS, _checksum);
}
else {
- kernelUncompressed <<<BLOCK_NUMBER, BLOCK_THREADS>>> (_buffResultManaged, _buffIsResultManaged, _dev_buffRangeStart, THREAD_STEPS);
+ kernelUncompressed <<<BLOCK_NUMBER, BLOCK_THREADS>>> (gpuIx, _buffResultManaged, _buffIsResultManaged, _dev_buffRangeStart, THREAD_STEPS);
}
}
return cudaGetLastError();
@@ -664,32 +790,59 @@ void printFooter() {
}
bool checkDevice() {
- cudaError_t cudaStatus = cudaSetDevice(DEVICE_NR);
- if (cudaStatus != cudaSuccess) {
- fprintf(stderr, "device %d failed!", DEVICE_NR);
- return false;
- }
- else {
- cudaDeviceProp props;
- cudaStatus = cudaGetDeviceProperties(&props, DEVICE_NR);
- printf("Using GPU nr %d:\n", DEVICE_NR);
- if (props.canMapHostMemory == 0) {
- printf("unified memory not supported\n");
- unifiedMemory = 0;
- }
- printf("%s (%2d procs)\n", props.name, props.multiProcessorCount);
- printf("maxThreadsPerBlock: %2d\n\n", props.maxThreadsPerBlock);
- if (BLOCK_NUMBER == 0) {
- BLOCK_NUMBER = props.multiProcessorCount * 8;
- }
- if (BLOCK_THREADS == 0) {
- BLOCK_THREADS = (props.maxThreadsPerBlock / 8) * 5;
+ if (DEVICE_NR == -1) {
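+ // '-d all': enumerate every CUDA device and derive launch defaults from device properties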
+ cudaGetDeviceCount(&nDevices);
+ //nDevices = 1;
+ for (int i = 0; i < nDevices; i++) {
+ cudaError_t cudaStatus = cudaSetDevice(i);
+ if (cudaStatus != cudaSuccess) {
+ fprintf(stderr, "device %d failed!", i);
+ return false;
+ }
+ cudaDeviceProp props;
+ cudaGetDeviceProperties(&props, i);
+ printf("Device Number: %d\n", i);
+ printf(" %s\n", props.name);
+ if (BLOCK_NUMBER == 0) {
+ BLOCK_NUMBER = props.multiProcessorCount * 4;
+ }
+ if (BLOCK_THREADS == 0) {
+ BLOCK_THREADS = props.maxThreadsPerBlock / 8 * 3;
+ }
}
outputSize = BLOCK_NUMBER * BLOCK_THREADS * THREAD_STEPS;
loopStride = new Int(&STRIDE);
loopStride.Mult(outputSize);
+ return true;
+ }
+ else {
+ cudaError_t cudaStatus = cudaSetDevice(DEVICE_NR);
+ if (cudaStatus != cudaSuccess) {
+ fprintf(stderr, "device %d failed!", DEVICE_NR);
+ return false;
+ }
+ else {
+ cudaDeviceProp props;
+ cudaStatus = cudaGetDeviceProperties(&props, DEVICE_NR);
+ printf("Using GPU nr %d:\n", DEVICE_NR);
+ if (props.canMapHostMemory == 0) {
+ printf("unified memory not supported\n");
+ unifiedMemory = 0;
+ }
+ printf("%s (%2d procs)\n", props.name, props.multiProcessorCount);
+ printf("maxThreadsPerBlock: %2d\n\n", props.maxThreadsPerBlock);
+ if (BLOCK_NUMBER == 0) {
+ BLOCK_NUMBER = props.multiProcessorCount * 8;
+ }
+ if (BLOCK_THREADS == 0) {
+ BLOCK_THREADS = (props.maxThreadsPerBlock / 8) * 5;
+ }
+ outputSize = BLOCK_NUMBER * BLOCK_THREADS * THREAD_STEPS;
+ loopStride = new Int(&STRIDE);
+ loopStride.Mult(outputSize);
+ }
+ return true;
}
- return true;
}
void showHelp() {
@@ -713,7 +866,7 @@ void showHelp() {
printf("-fresultp reportFile: file for each WIF with correct checksum (default: %s)\n", fileResultPartial.c_str());
printf("-fstatus statusFile: file for periodically saved status (default: %s) \n", fileStatus.c_str());
printf("-fstatusIntv seconds: period between status file updates (default %d sec) \n", fileStatusInterval);
- printf("-d deviceId: default 0\n");
+ printf("-d deviceId: default 0, '-d all' for all available CUDA devices\n");
printf("-c : search for compressed address\n");
printf("-u : search for uncompressed address (default)\n");
printf("-b NbBlocks: default processorCount * 8\n");
@@ -754,7 +907,12 @@ bool readArgs(int argc, char** argv) {
}
else if (strcmp(argv[a], "-d") == 0) {
a++;
- DEVICE_NR = strtol(argv[a], NULL, 10);
+ if ("all" == string(argv[a])) {
+ DEVICE_NR = -1;
+ }
+ else {
+ DEVICE_NR = strtol(argv[a], NULL, 10);
+ }
}
else if (strcmp(argv[a], "-c") == 0) {
COMPRESSED = true;