Skip to content
This repository has been archived by the owner on Oct 7, 2024. It is now read-only.

Commit

Permalink
Fix and improve the OpenCL implementation (#14)
Browse files Browse the repository at this point in the history
Co-authored-by: Kaleinthranx <[email protected]>
  • Loading branch information
Djadih and Kaleinthranx authored Nov 7, 2023
1 parent d7ecaaf commit 864e43c
Show file tree
Hide file tree
Showing 9 changed files with 3,623 additions and 1,487 deletions.
7 changes: 6 additions & 1 deletion cmake/Hunter/config.cmake
Original file line number Diff line number Diff line change
@@ -1,2 +1,7 @@
hunter_config(CURL VERSION ${HUNTER_CURL_VERSION} CMAKE_ARGS HTTP_ONLY=ON CMAKE_USE_OPENSSL=OFF CMAKE_USE_LIBSSH2=OFF CURL_CA_PATH=none)
hunter_config(Boost VERSION 1.66.0)
hunter_config(Boost VERSION 1.70.0-p0)

hunter_config(OpenCL VERSION
URL https://github.com/KhronosGroup/OpenCL-SDK/releases/download/v2023.04.17/OpenCL-SDK-v2023.04.17-Source.tar.gz
SHA1 aca203982e9f1cdbe71ed93ae7e0c217b1d93a37
)
4,903 changes: 3,511 additions & 1,392 deletions libethash-cl/CL/cl2.hpp → libethash-cl/CL/opencl.hpp

Large diffs are not rendered by default.

37 changes: 19 additions & 18 deletions libethash-cl/CLMiner.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -695,6 +695,16 @@ bool CLMiner::initDevice()
<< m_settings.globalWorkSize / m_settings.localWorkSize;
}

#ifndef __clang__
// Nvidia
if (!m_deviceDescriptor.clNvCompute.empty())
{
m_computeCapability =
m_deviceDescriptor.clNvComputeMajor * 10 + m_deviceDescriptor.clNvComputeMinor;
int maxregs = m_computeCapability >= 35 ? 72 : 63;
sprintf(m_options, "-cl-nv-maxrregcount=%d", maxregs);
}
#endif

return true;

Expand Down Expand Up @@ -726,18 +736,6 @@ bool CLMiner::initEpoch_internal()
try
{
char options[256] = {0};
#ifndef __clang__

// Nvidia
if (!m_deviceDescriptor.clNvCompute.empty())
{
m_computeCapability =
m_deviceDescriptor.clNvComputeMajor * 10 + m_deviceDescriptor.clNvComputeMinor;
int maxregs = m_computeCapability >= 35 ? 72 : 63;
sprintf(m_options, "-cl-nv-maxrregcount=%d", maxregs);
}

#endif

m_dagItems = m_epochContext.dagNumItems;

Expand Down Expand Up @@ -836,9 +834,15 @@ bool CLMiner::initEpoch_internal()
// GPU DAG buffer to kernel
m_searchKernel.setArg(2, *m_dag);

m_dagKernel.setArg(1, *m_light);
m_dagKernel.setArg(2, *m_dag);
uint32_t light_words4[4];
ProgPow::calculate_fast_mod_data(m_epochContext.lightNumItems, light_words4[0], light_words4[1], light_words4[2]);
light_words4[3] = m_epochContext.lightNumItems;

m_dagKernel.setArg(1, m_light[0]);
m_dagKernel.setArg(2, m_dag[0]);
m_dagKernel.setArg(3, -1);
m_dagKernel.setArg(4, (uint32_t)(m_epochContext.dagSize / sizeof(ethash_hash512)));
m_dagKernel.setArg(5, light_words4);

const uint32_t workItems = m_dagItems * 2; // GPU computes partial 512-bit DAG items.

Expand Down Expand Up @@ -889,13 +893,10 @@ void CLMiner::asyncCompile()

void CLMiner::compileKernel(uint64_t period_seed, cl::Program& program, cl::Kernel& searchKernel)
{
std::string code = ProgPow::getKern(period_seed, ProgPow::KERNEL_CL);
code += string(CLMiner_kernel);
std::string code = ProgPow::getKern(CLMiner_kernel, period_seed, ProgPow::KERNEL_CL);

addDefinition(code, "GROUP_SIZE", m_settings.localWorkSize);
addDefinition(code, "ACCESSES", 64);
addDefinition(code, "LIGHT_WORDS", m_epochContext.lightNumItems);
addDefinition(code, "PROGPOW_DAG_BYTES", m_epochContext.dagSize);
addDefinition(code, "PROGPOW_DAG_ELEMENTS", m_epochContext.dagNumItems / 2);

addDefinition(code, "MAX_OUTPUTS", c_maxSearchResults);
Expand Down
2 changes: 1 addition & 1 deletion libethash-cl/CLMiner.h
Original file line number Diff line number Diff line change
Expand Up @@ -25,7 +25,7 @@
#define CL_HPP_CL_1_2_DEFAULT_BUILD true
#define CL_HPP_TARGET_OPENCL_VERSION 120
#define CL_HPP_MINIMUM_OPENCL_VERSION 120
#include "CL/cl2.hpp"
#include "CL/opencl.hpp"
#pragma GCC diagnostic pop

// macOS OpenCL fix:
Expand Down
57 changes: 31 additions & 26 deletions libethash-cl/CLMiner_kernel.cl
Original file line number Diff line number Diff line change
@@ -1,3 +1,5 @@
PROGPOW_REPLACE_HEADER

#define OPENCL_PLATFORM_UNKNOWN 0
#define OPENCL_PLATFORM_NVIDIA 1
#define OPENCL_PLATFORM_AMD 2
Expand All @@ -17,6 +19,8 @@

#define HASHES_PER_GROUP (GROUP_SIZE / PROGPOW_LANES)

#define FNV_PRIME 0x1000193

typedef struct
{
uint32_t uint32s[32 / sizeof(uint32_t)];
Expand Down Expand Up @@ -210,9 +214,10 @@ ethash_search(__global struct SearchResults* restrict g_output, __constant hash3
// initialize mix for all lanes
fill_mix(hash_seed, lane_id, mix);

#pragma unroll 1
for (uint32_t l = 0; l < PROGPOW_CNT_DAG; l++)
progPowLoop(l, mix, g_dag, c_dag, share[0].uint64s, hack_false);
#pragma unroll 2
for (uint32_t loop = 0; loop < PROGPOW_CNT_DAG; loop++) {
PROGPOW_REPLACE_MATH
}

// Reduce mix data to a per-lane 32-bit digest
uint32_t mix_hash = 0x811c9dc5;
Expand Down Expand Up @@ -256,15 +261,9 @@ ethash_search(__global struct SearchResults* restrict g_output, __constant hash3
//


#ifndef LIGHT_WORDS
#define LIGHT_WORDS 262139
#endif

#define ETHASH_DATASET_PARENTS 256
#define NODE_WORDS (64 / 4)

#define FNV_PRIME 0x01000193

__constant uint2 const Keccak_f1600_RC[24] = {
(uint2)(0x00000001, 0x00000000),
(uint2)(0x00008082, 0x00000000),
Expand Down Expand Up @@ -509,27 +508,33 @@ static void SHA3_512(uint2* s, uint isolate)
keccak_f1600_no_absorb(s, 8, isolate);
}

static uint fast_mod(uint a, uint4 d)
{
const ulong t = a;
const uint q = ((t + d.y) * d.x) >> d.z;
return a - q * d.w;
}

__kernel void ethash_calculate_dag_item(
uint start, __global hash64_t const* g_light, __global hash64_t* g_dag, uint isolate)
uint start, __global hash64_t const* g_light, __global hash64_t* g_dag, uint isolate, uint dag_words, uint4 light_words)
{
uint const node_index = start + get_global_id(0);
if (node_index * sizeof(hash64_t) >= PROGPOW_DAG_BYTES)
return;
if (node_index >= dag_words)
return;

hash200_t dag_node;
copy(dag_node.uint4s, g_light[node_index % LIGHT_WORDS].uint4s, 4);
dag_node.words[0] ^= node_index;
SHA3_512(dag_node.uint2s, isolate);
hash200_t dag_node;
copy(dag_node.uint4s, g_light[fast_mod(node_index, light_words)].uint4s, 4);
dag_node.words[0] ^= node_index;
SHA3_512(dag_node.uint2s, isolate);

for (uint i = 0; i != ETHASH_DATASET_PARENTS; ++i)
{
uint parent_index = fnv(node_index ^ i, dag_node.words[i % NODE_WORDS]) % LIGHT_WORDS;
for (uint i = 0; i != ETHASH_DATASET_PARENTS; ++i)
{
uint parent_index = fast_mod(fnv(node_index ^ i, dag_node.words[i % NODE_WORDS]), light_words);

for (uint w = 0; w != 4; ++w)
{
dag_node.uint4s[w] = fnv4(dag_node.uint4s[w], g_light[parent_index].uint4s[w]);
}
}
SHA3_512(dag_node.uint2s, isolate);
copy(g_dag[node_index].uint4s, dag_node.uint4s, 4);
for (uint w = 0; w != 4; ++w)
dag_node.uint4s[w] = fnv4(dag_node.uint4s[w], g_light[parent_index].uint4s[w]);
}

SHA3_512(dag_node.uint2s, isolate);
copy(g_dag[node_index].uint4s, dag_node.uint4s, 4);
}
3 changes: 1 addition & 2 deletions libethash-cuda/CUDAMiner.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -363,8 +363,7 @@ void CUDAMiner::compileKernel(uint64_t period_seed, uint64_t dag_elms, CUfunctio

const char* name = "progpow_search";

std::string text = ProgPow::getKern(period_seed, ProgPow::KERNEL_CUDA);
text += std::string(CUDAMiner_kernel);
std::string text = ProgPow::getKern(CUDAMiner_kernel, period_seed, ProgPow::KERNEL_CUDA);

std::string tmpDir;
#ifdef _WIN32
Expand Down
9 changes: 6 additions & 3 deletions libethash-cuda/CUDAMiner_kernel.cu
Original file line number Diff line number Diff line change
@@ -1,3 +1,5 @@
PROGPOW_REPLACE_HEADER

#ifndef MAX_SEARCH_RESULTS
#define MAX_SEARCH_RESULTS 4U
#endif
Expand Down Expand Up @@ -181,9 +183,10 @@ progpow_search(
// initialize mix for all lanes
fill_mix(hash_seed, lane_id, mix);

#pragma unroll 1
for (uint32_t l = 0; l < PROGPOW_CNT_DAG; l++)
progPowLoop(l, mix, g_dag, c_dag, hack_false);
#pragma unroll 2
for (uint32_t loop = 0; loop < PROGPOW_CNT_DAG; loop++) {
PROGPOW_REPLACE_MATH
}


// Reduce mix data to a per-lane 32-bit digest
Expand Down
78 changes: 35 additions & 43 deletions libprogpow/ProgPow.cpp
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
#include "ProgPow.h"

#include <sstream>
#include <regex>

#define rnd() (kiss99(rnd_state))
#define mix_src() ("mix[" + std::to_string(rnd() % PROGPOW_REGS) + "]")
Expand All @@ -14,7 +15,7 @@ inline void swap(uint32_t& a, uint32_t& b)
b = t;
}

std::string ProgPow::getKern(uint64_t prog_seed, kernel_t kern)
std::string ProgPow::getKern(std::string kernel_code, uint64_t prog_seed, kernel_t kern)
{
std::stringstream ret;

Expand Down Expand Up @@ -99,45 +100,16 @@ std::string ProgPow::getKern(uint64_t prog_seed, kernel_t kern)
{
ret << "typedef struct __align__(16) {uint32_t s[PROGPOW_DAG_LOADS];} dag_t;\n";
ret << "\n";
ret << "// Inner loop for prog_seed " << prog_seed << "\n";
ret << "__device__ __forceinline__ void progPowLoop(const uint32_t loop,\n";
ret << " uint32_t mix[PROGPOW_REGS],\n";
ret << " const dag_t *g_dag,\n";
ret << " const uint32_t c_dag[PROGPOW_CACHE_WORDS],\n";
ret << " const bool hack_false)\n";
}
else
{
ret << "typedef struct __attribute__ ((aligned (16))) {uint32_t s[PROGPOW_DAG_LOADS];} dag_t;\n";
ret << "\n";
ret << "// Inner loop for prog_seed " << prog_seed << "\n";
ret << "inline void progPowLoop(const uint32_t loop,\n";
ret << " volatile uint32_t mix_arg[PROGPOW_REGS],\n";
ret << " __global const dag_t *g_dag,\n";
ret << " __local const uint32_t c_dag[PROGPOW_CACHE_WORDS],\n";
ret << " __local uint64_t share[GROUP_SHARE],\n";
ret << " const bool hack_false)\n";
}
ret << "{\n";
std::string kernel = std::regex_replace(kernel_code, std::regex("PROGPOW_REPLACE_HEADER"), ret.str());
ret.str(std::string());

ret << "dag_t data_dag;\n";
ret << "uint32_t offset, data;\n";
// Work around AMD OpenCL compiler bug
// See https://github.com/gangnamtestnet/ethcoreminer/issues/16
if (kern == KERNEL_CL)
{
ret << "uint32_t mix[PROGPOW_REGS];\n";
ret << "for(uint32_t i=0; i<PROGPOW_REGS; i++)\n";
ret << " mix[i] = mix_arg[i];\n";
}

if (kern == KERNEL_CUDA)
ret << "const uint32_t lane_id = threadIdx.x & (PROGPOW_LANES-1);\n";
else
{
ret << "const uint32_t lane_id = get_local_id(0) & (PROGPOW_LANES-1);\n";
ret << "const uint32_t group_id = get_local_id(0) / PROGPOW_LANES;\n";
}

// Global memory access
// lanes access sequential locations
Expand All @@ -149,13 +121,14 @@ std::string ProgPow::getKern(uint64_t prog_seed, kernel_t kern)
else
{
ret << "if(lane_id == (loop % PROGPOW_LANES))\n";
ret << " share[group_id] = mix[0];\n";
ret << " share[0].uint32s[group_id] = mix[0];\n";
ret << "barrier(CLK_LOCAL_MEM_FENCE);\n";
ret << "offset = share[group_id];\n";
ret << "offset = share[0].uint32s[group_id];\n";
}
ret << "offset %= PROGPOW_DAG_ELEMENTS;\n";
ret << "offset = offset * PROGPOW_LANES + (lane_id ^ loop) % PROGPOW_LANES;\n";
ret << "data_dag = g_dag[offset];\n";
ret << "dag_t data_dag = g_dag[offset];\n";

ret << "// hack to prevent compiler from reordering LD and usage\n";
if (kern == KERNEL_CUDA)
ret << "if (hack_false) __threadfence_block();\n";
Expand Down Expand Up @@ -208,16 +181,10 @@ std::string ProgPow::getKern(uint64_t prog_seed, kernel_t kern)
uint32_t r = rnd();
ret << merge(dest, "data_dag.s["+std::to_string(i)+"]", r);
}
// Work around AMD OpenCL compiler bug
if (kern == KERNEL_CL)
{
ret << "for(uint32_t i=0; i<PROGPOW_REGS; i++)\n";
ret << " mix_arg[i] = mix[i];\n";
}
ret << "}\n";
ret << "\n";

return ret.str();
kernel = std::regex_replace(kernel, std::regex("PROGPOW_REPLACE_MATH"), ret.str());
return kernel;
}

// Merge new data from b into the value in a
Expand Down Expand Up @@ -291,3 +258,28 @@ uint32_t ProgPow::kiss99(kiss99_t &st)
st.jcong = 69069 * st.jcong + 1234567;
return ((MWC^st.jcong) + st.jsr);
}

void ProgPow::calculate_fast_mod_data(uint32_t divisor, uint32_t& reciprocal, uint32_t& increment, uint32_t& shift)
{
if ((divisor & (divisor - 1)) == 0) {
reciprocal = 1;
increment = 0;
shift = 31U - clz(divisor);
}
else {
shift = 63U - clz(divisor);
const uint64_t N = 1ULL << shift;
const uint64_t q = N / divisor;
const uint64_t r = N - q * divisor;
if (r * 2 < divisor)
{
reciprocal = static_cast<uint32_t>(q);
increment = 1;
}
else
{
reciprocal = static_cast<uint32_t>(q + 1);
increment = 0;
}
}
}
14 changes: 13 additions & 1 deletion libprogpow/ProgPow.h
Original file line number Diff line number Diff line change
Expand Up @@ -29,8 +29,9 @@ class ProgPow
KERNEL_CL
} kernel_t;

static std::string getKern(uint64_t seed, kernel_t kern);

static std::string getKern(std::string kernel_code, uint64_t seed, kernel_t kern);
static void calculate_fast_mod_data(uint32_t divisor, uint32_t& reciprocal, uint32_t& increment, uint32_t& shift);
private:
static std::string math(std::string d, std::string a, std::string b, uint32_t r);
static std::string merge(std::string a, std::string b, uint32_t r);
Expand All @@ -43,4 +44,15 @@ class ProgPow
uint32_t z, w, jsr, jcong;
} kiss99_t;
static uint32_t kiss99(kiss99_t &st);

static uint32_t clz(uint32_t a)
{
#ifdef _MSC_VER
unsigned long index;
_BitScanReverse(&index, a);
return 31 - index;
#else
return __builtin_clz(a);
#endif
}
};

0 comments on commit 864e43c

Please sign in to comment.