v 0.6, multigpu
PawelGorny committed Sep 29, 2022
1 parent 713b768 commit 88cd469
Showing 5 changed files with 215 additions and 57 deletions.
2 changes: 1 addition & 1 deletion README.md
@@ -25,7 +25,7 @@ Usage:
-fresultp reportFile: file for each WIF with correct checksum (default: result_partial.txt)
-fstatus statusFile: file for periodically saved status (default: fileStatus.txt)
-fstatusIntv seconds: period between status file updates (default 60 sec)
-d deviceId: default 0
-d deviceId: default 0, '-d all' for all available CUDA devices
-c : search for compressed address
-u : search for uncompressed address (default)
-b NbBlocks: default processorCount * 8
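For illustration only, a multi-GPU run combining the options above might look like the hypothetical command line below ('-d all' is the option added in this commit; the solver's required target and range options are omitted because they are not shown in this diff):

    WifSolverCuda -d all -c -b 64 -fresultp result_partial.txt -fstatus fileStatus.txt -fstatusIntv 60 [target and range options]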
8 changes: 4 additions & 4 deletions WifSolverCuda/WifSolverCuda.vcxproj
@@ -30,7 +30,7 @@
</PropertyGroup>
<Import Project="$(VCTargetsPath)\Microsoft.Cpp.props" />
<ImportGroup Label="ExtensionSettings">
<Import Project="$(VCTargetsPath)\BuildCustomizations\CUDA 11.6.props" />
<Import Project="$(VCTargetsPath)\BuildCustomizations\CUDA 11.7.props" />
</ImportGroup>
<ImportGroup Label="PropertySheets" Condition="'$(Configuration)|$(Platform)'=='Debug|x64'">
<Import Project="$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props" Condition="exists('$(UserRootDir)\Microsoft.Cpp.$(Platform).user.props')" Label="LocalAppDataPlatform" />
@@ -55,7 +55,7 @@
</Link>
<CudaCompile>
<TargetMachinePlatform>64</TargetMachinePlatform>
<CodeGeneration>compute_86,sm_86</CodeGeneration>
<CodeGeneration>compute_75,sm_75</CodeGeneration>
</CudaCompile>
</ItemDefinitionGroup>
<ItemDefinitionGroup Condition="'$(Configuration)|$(Platform)'=='Release|x64'">
@@ -75,7 +75,7 @@
</Link>
<CudaCompile>
<TargetMachinePlatform>64</TargetMachinePlatform>
<CodeGeneration>compute_86,sm_86</CodeGeneration>
<CodeGeneration>compute_75,sm_75</CodeGeneration>
</CudaCompile>
</ItemDefinitionGroup>
<ItemGroup>
@@ -112,6 +112,6 @@
</ItemGroup>
<Import Project="$(VCTargetsPath)\Microsoft.Cpp.targets" />
<ImportGroup Label="ExtensionTargets">
<Import Project="$(VCTargetsPath)\BuildCustomizations\CUDA 11.6.targets" />
<Import Project="$(VCTargetsPath)\BuildCustomizations\CUDA 11.7.targets" />
</ImportGroup>
</Project>
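For reference, the project file above now imports the CUDA 11.7 build customizations and generates code for compute_75,sm_75. A roughly equivalent stand-alone compile would look something like the sketch below (illustrative only; the actual source list and remaining flags come from the .vcxproj):

    nvcc -m64 -gencode arch=compute_75,code=sm_75 <sources> -o WifSolverCuda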
11 changes: 6 additions & 5 deletions WifSolverCuda/Worker.cuh
@@ -1,5 +1,6 @@

#include <stdint.h>
#include <stdio.h>
#include <cuda.h>
#include "cuda_runtime.h"
#include "device_launch_parameters.h"
@@ -13,10 +14,10 @@ __global__ void kernelUncompressed(bool* buffResult, bool* buffCollectorWork, ui
__global__ void kernelCompressed(bool* buffResult, bool* buffCollectorWork, uint64_t* const __restrict__ buffRangeStart, const int threadNumberOfChecks, const uint32_t checksum);
__global__ void resultCollector(bool* buffResult, uint64_t* buffCombinedResult, const uint64_t threadsInBlockNumberOfChecks);

__global__ void kernelCompressed(uint32_t* unifiedResult, bool* isResultFlag, uint64_t* const __restrict__ buffRangeStart, const int threadNumberOfChecks);
__global__ void kernelCompressed(uint32_t* unifiedResult, bool* isResultFlag, uint64_t* const __restrict__ buffRangeStart, const int threadNumberOfChecks, const uint32_t checksum);
__global__ void kernelUncompressed(uint32_t* unifiedResult, bool* isResultFlag, uint64_t* const __restrict__ buffRangeStart, const int threadNumberOfChecks);
__global__ void kernelUncompressed(uint32_t* unifiedResult, bool* isResultFlag, uint64_t* const __restrict__ buffRangeStart, const int threadNumberOfChecks, const uint32_t checksum);
__global__ void kernelCompressed(const int gpuIx, uint32_t* unifiedResult, bool* isResultFlag, uint64_t* const __restrict__ buffRangeStart, const int threadNumberOfChecks);
__global__ void kernelCompressed(const int gpuIx, uint32_t* unifiedResult, bool* isResultFlag, uint64_t* const __restrict__ buffRangeStart, const int threadNumberOfChecks, const uint32_t checksum);
__global__ void kernelUncompressed(const int gpuIx, uint32_t* unifiedResult, bool* isResultFlag, uint64_t* const __restrict__ buffRangeStart, const int threadNumberOfChecks);
__global__ void kernelUncompressed(const int gpuIx, uint32_t* unifiedResult, bool* isResultFlag, uint64_t* const __restrict__ buffRangeStart, const int threadNumberOfChecks, const uint32_t checksum);

__device__ bool _checksumDoubleSha256CheckUncompressed(unsigned int checksum, beu32* d_hash, uint64_t* _start);
__device__ bool _checksumDoubleSha256CheckCompressed(unsigned int checksum, beu32* d_hash, uint64_t* _start);
@@ -29,6 +30,6 @@ __device__ void _load(uint64_t* C, uint64_t* A);
__device__ void IMult(uint64_t* r, uint64_t* a, int64_t b);

__device__ void initShared();
__device__ void summaryShared(uint32_t* unifiedResult, bool* isResultFlag);
__device__ __inline__ void summaryShared(const int gpuIx, uint32_t* unifiedResult, bool* isResultFlag);

cudaError_t loadStride(uint64_t* stride);
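The gpuIx parameter added to the kernel declarations above suggests a host-side dispatch roughly like the following sketch. It is only an illustration of how the new signatures could be driven per device; launchOnAllDevices, the buffer setup and the launch geometry are assumptions, not code from this repository (the real dispatch lives in the program's main source, which is not part of this diff).

#include <cstdint>
#include <cuda_runtime.h>
#include "Worker.cuh"

// Hypothetical multi-GPU dispatch for the gpuIx-aware kernels declared above.
// Assumes unifiedResult and isResultFlag were allocated with cudaMallocManaged,
// so every device can write its own result slice and flag slot.
void launchOnAllDevices(uint32_t* unifiedResult, bool* isResultFlag,
                        uint64_t** buffRangeStartPerDevice,
                        int blocks, int threads,
                        int threadNumberOfChecks, uint32_t checksum) {
    int deviceCount = 0;
    cudaGetDeviceCount(&deviceCount);
    for (int gpuIx = 0; gpuIx < deviceCount; gpuIx++) {
        cudaSetDevice(gpuIx);
        kernelCompressed<<<blocks, threads>>>(gpuIx, unifiedResult, isResultFlag,
                                              buffRangeStartPerDevice[gpuIx],
                                              threadNumberOfChecks, checksum);
    }
    for (int gpuIx = 0; gpuIx < deviceCount; gpuIx++) {
        cudaSetDevice(gpuIx);
        cudaDeviceSynchronize();   // wait for every device before reading results back
    }
}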
25 changes: 12 additions & 13 deletions WifSolverCuda/Worker1.cu
@@ -91,7 +91,7 @@ __global__ void resultCollector(bool* buffResult, uint64_t* buffCombinedResult,
buffCombinedResult[blockIdx.x] = 0xffffffffffff;
}

__global__ void kernelUncompressed(uint32_t* unifiedResult, bool* isResultFlag, uint64_t* const __restrict__ buffRangeStart, const int threadNumberOfChecks, const uint32_t checksum) {
__global__ void kernelUncompressed(const int gpuIx, uint32_t* unifiedResult, bool* isResultFlag, uint64_t* const __restrict__ buffRangeStart, const int threadNumberOfChecks, const uint32_t checksum) {
uint64_t _start[5];
beu32 d_hash[8];

@@ -112,9 +112,9 @@ }
}
_add(_start, _stride);
}
summaryShared(unifiedResult, isResultFlag);
summaryShared(gpuIx, unifiedResult, isResultFlag);
}
__global__ void kernelUncompressed(uint32_t* unifiedResult, bool* isResultFlag, uint64_t* const __restrict__ buffRangeStart, const int threadNumberOfChecks) {
__global__ void kernelUncompressed(const int gpuIx, uint32_t* unifiedResult, bool* isResultFlag, uint64_t* const __restrict__ buffRangeStart, const int threadNumberOfChecks) {
uint64_t _start[5];
beu32 d_hash[8];

@@ -135,12 +135,11 @@ }
}
_add(_start, _stride);
}
summaryShared(unifiedResult, isResultFlag);
summaryShared(gpuIx, unifiedResult, isResultFlag);
}
__global__ void kernelCompressed(uint32_t* unifiedResult, bool* isResultFlag, uint64_t* const __restrict__ buffRangeStart, const int threadNumberOfChecks) {
__global__ void kernelCompressed(const int gpuIx, uint32_t* unifiedResult, bool* isResultFlag, uint64_t* const __restrict__ buffRangeStart, const int threadNumberOfChecks) {
uint64_t _start[5];
beu32 d_hash[8];

int64_t resIx = threadIdx.x;
int64_t tIx = (threadIdx.x + blockIdx.x * blockDim.x) * threadNumberOfChecks;
IMult(_start, _stride, tIx);
@@ -162,9 +161,9 @@ }
}
_add(_start, _stride);
}
summaryShared(unifiedResult, isResultFlag);
summaryShared(gpuIx, unifiedResult, isResultFlag);
}
__global__ void kernelCompressed(uint32_t* unifiedResult, bool* isResultFlag, uint64_t* const __restrict__ buffRangeStart, const int threadNumberOfChecks, const uint32_t checksum) {
__global__ void kernelCompressed(const int gpuIx, uint32_t* unifiedResult, bool* isResultFlag, uint64_t* const __restrict__ buffRangeStart, const int threadNumberOfChecks, const uint32_t checksum) {
uint64_t _start[5];
beu32 d_hash[8];

@@ -189,7 +188,7 @@ }
}
_add(_start, _stride);
}
summaryShared(unifiedResult, isResultFlag);
summaryShared(gpuIx, unifiedResult, isResultFlag);
}

__device__ __inline__ void initShared() {
@@ -202,11 +201,11 @@ __device__ __inline__ void initShared() {
}
__syncthreads();
}
__device__ __inline__ void summaryShared(uint32_t* unifiedResult, bool* isResultFlag) {
__device__ __inline__ void summaryShared(const int gpuIx, uint32_t* unifiedResult, bool* isResultFlag) {
__syncthreads();
if (_blockResultFlag[0] && threadIdx.x == 0) {
isResultFlag[0] = true;
for (int i = 0, rIx = blockIdx.x; i < blockDim.x * 4; i++) {
if (threadIdx.x == 0 && _blockResultFlag[0]) {
isResultFlag[gpuIx] = true;
for (int i = 0, rIx = (blockIdx.x + 4*gpuIx*gridDim.x*blockDim.x); i < blockDim.x * 4; i++) {
if (_blockResults[i] != UINT32_MAX) {
unifiedResult[rIx] = _blockResults[i];
rIx += gridDim.x;
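A side note on the new indexing in summaryShared: rIx now starts at blockIdx.x + 4*gpuIx*gridDim.x*blockDim.x and isResultFlag is indexed by gpuIx, so each device appears to own one flag slot plus a contiguous region of 4 * blocks * threads entries in unifiedResult. A hedged host-side sketch of reading that layout back (collectHits and the variable names are illustrative, not from this repository):

#include <cstddef>
#include <cstdint>

// Assumed result layout, derived from summaryShared above:
//   slots per GPU : 4 * blocks * threads uint32_t entries
//   region offset : gpuIx * (4 * blocks * threads)
//   flag slot     : isResultFlag[gpuIx]
void collectHits(const uint32_t* unifiedResult, const bool* isResultFlag,
                 int deviceCount, int blocks, int threads) {
    const size_t slotsPerGpu = (size_t)4 * blocks * threads;
    for (int gpuIx = 0; gpuIx < deviceCount; gpuIx++) {
        if (!isResultFlag[gpuIx]) {
            continue;                                   // this device reported nothing
        }
        const uint32_t* slice = unifiedResult + (size_t)gpuIx * slotsPerGpu;
        for (size_t i = 0; i < slotsPerGpu; i++) {
            if (slice[i] != UINT32_MAX) {
                // slice[i] is a candidate result written by device gpuIx
            }
        }
    }
}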
