Fix and improve the OpenCL implementation (#14)

Co-authored-by: Kaleinthranx <[email protected]>
dominant-strategies · Nov 7, 2023 · 864e43c · 864e43c
1 parent d7ecaaf
commit 864e43c
Show file tree

Hide file tree

Showing 9 changed files with 3,623 additions and 1,487 deletions.
diff --git a/cmake/Hunter/config.cmake b/cmake/Hunter/config.cmake
@@ -1,2 +1,7 @@
 hunter_config(CURL VERSION ${HUNTER_CURL_VERSION} CMAKE_ARGS HTTP_ONLY=ON CMAKE_USE_OPENSSL=OFF CMAKE_USE_LIBSSH2=OFF CURL_CA_PATH=none)
-hunter_config(Boost VERSION 1.66.0)
+hunter_config(Boost VERSION 1.70.0-p0)
+
+hunter_config(OpenCL VERSION
+    URL https://github.com/KhronosGroup/OpenCL-SDK/releases/download/v2023.04.17/OpenCL-SDK-v2023.04.17-Source.tar.gz
+    SHA1 aca203982e9f1cdbe71ed93ae7e0c217b1d93a37
+)
diff --git a/libethash-cl/CL/cl2.hpp → libethash-cl/CL/opencl.hpp b/libethash-cl/CL/cl2.hpp → libethash-cl/CL/opencl.hpp
diff --git a/libethash-cl/CLMiner.cpp b/libethash-cl/CLMiner.cpp
@@ -695,6 +695,16 @@ bool CLMiner::initDevice()
               << m_settings.globalWorkSize / m_settings.localWorkSize;
     }
 
+#ifndef __clang__
+        // Nvidia
+        if (!m_deviceDescriptor.clNvCompute.empty())
+        {
+            m_computeCapability =
+                m_deviceDescriptor.clNvComputeMajor * 10 + m_deviceDescriptor.clNvComputeMinor;
+            int maxregs = m_computeCapability >= 35 ? 72 : 63;
+            sprintf(m_options, "-cl-nv-maxrregcount=%d", maxregs);
+        }
+#endif
 
     return true;
 
@@ -726,18 +736,6 @@ bool CLMiner::initEpoch_internal()
     try
     {
         char options[256] = {0};
-#ifndef __clang__
-
-        // Nvidia
-        if (!m_deviceDescriptor.clNvCompute.empty())
-        {
-            m_computeCapability =
-                m_deviceDescriptor.clNvComputeMajor * 10 + m_deviceDescriptor.clNvComputeMinor;
-            int maxregs = m_computeCapability >= 35 ? 72 : 63;
-            sprintf(m_options, "-cl-nv-maxrregcount=%d", maxregs);
-        }
-
-#endif
 
         m_dagItems = m_epochContext.dagNumItems;
 
@@ -836,9 +834,15 @@ bool CLMiner::initEpoch_internal()
         // GPU DAG buffer to kernel
         m_searchKernel.setArg(2, *m_dag);
 
-        m_dagKernel.setArg(1, *m_light);
-        m_dagKernel.setArg(2, *m_dag);
+        uint32_t light_words4[4];
+        ProgPow::calculate_fast_mod_data(m_epochContext.lightNumItems, light_words4[0], light_words4[1], light_words4[2]);
+        light_words4[3] = m_epochContext.lightNumItems;
+
+        m_dagKernel.setArg(1, m_light[0]);
+        m_dagKernel.setArg(2, m_dag[0]);
         m_dagKernel.setArg(3, -1);
+        m_dagKernel.setArg(4, (uint32_t)(m_epochContext.dagSize / sizeof(ethash_hash512)));
+        m_dagKernel.setArg(5, light_words4);
 
         const uint32_t workItems = m_dagItems * 2;  // GPU computes partial 512-bit DAG items.
 
@@ -889,13 +893,10 @@ void CLMiner::asyncCompile()
 
 void CLMiner::compileKernel(uint64_t period_seed, cl::Program& program, cl::Kernel& searchKernel)
 {
-    std::string code = ProgPow::getKern(period_seed, ProgPow::KERNEL_CL);
-    code += string(CLMiner_kernel);
+    std::string code = ProgPow::getKern(CLMiner_kernel, period_seed, ProgPow::KERNEL_CL);
 
     addDefinition(code, "GROUP_SIZE", m_settings.localWorkSize);
     addDefinition(code, "ACCESSES", 64);
-    addDefinition(code, "LIGHT_WORDS", m_epochContext.lightNumItems);
-    addDefinition(code, "PROGPOW_DAG_BYTES", m_epochContext.dagSize);
     addDefinition(code, "PROGPOW_DAG_ELEMENTS", m_epochContext.dagNumItems / 2);
 
     addDefinition(code, "MAX_OUTPUTS", c_maxSearchResults);

diff --git a/libethash-cl/CLMiner.h b/libethash-cl/CLMiner.h
@@ -25,7 +25,7 @@
 #define CL_HPP_CL_1_2_DEFAULT_BUILD true
 #define CL_HPP_TARGET_OPENCL_VERSION 120
 #define CL_HPP_MINIMUM_OPENCL_VERSION 120
-#include "CL/cl2.hpp"
+#include "CL/opencl.hpp"
 #pragma GCC diagnostic pop
 
 // macOS OpenCL fix:

diff --git a/libethash-cl/CLMiner_kernel.cl b/libethash-cl/CLMiner_kernel.cl
@@ -1,3 +1,5 @@
+PROGPOW_REPLACE_HEADER
+
 #define OPENCL_PLATFORM_UNKNOWN 0
 #define OPENCL_PLATFORM_NVIDIA 1
 #define OPENCL_PLATFORM_AMD 2
@@ -17,6 +19,8 @@
 
 #define HASHES_PER_GROUP (GROUP_SIZE / PROGPOW_LANES)
 
+#define FNV_PRIME 0x1000193
+
 typedef struct
 {
     uint32_t uint32s[32 / sizeof(uint32_t)];
@@ -210,9 +214,10 @@ ethash_search(__global struct SearchResults* restrict g_output, __constant hash3
         // initialize mix for all lanes
         fill_mix(hash_seed, lane_id, mix);
 
-#pragma unroll 1
-        for (uint32_t l = 0; l < PROGPOW_CNT_DAG; l++)
-            progPowLoop(l, mix, g_dag, c_dag, share[0].uint64s, hack_false);
+#pragma unroll 2
+        for (uint32_t loop = 0; loop < PROGPOW_CNT_DAG; loop++) {
+            PROGPOW_REPLACE_MATH
+		}
 
         // Reduce mix data to a per-lane 32-bit digest
         uint32_t mix_hash = 0x811c9dc5;
@@ -256,15 +261,9 @@ ethash_search(__global struct SearchResults* restrict g_output, __constant hash3
 //
 
 
-#ifndef LIGHT_WORDS
-#define LIGHT_WORDS 262139
-#endif
-
 #define ETHASH_DATASET_PARENTS 256
 #define NODE_WORDS (64 / 4)
 
-#define FNV_PRIME 0x01000193
-
 __constant uint2 const Keccak_f1600_RC[24] = {
     (uint2)(0x00000001, 0x00000000),
     (uint2)(0x00008082, 0x00000000),
@@ -509,27 +508,33 @@ static void SHA3_512(uint2* s, uint isolate)
     keccak_f1600_no_absorb(s, 8, isolate);
 }
 
+static uint fast_mod(uint a, uint4 d)
+{
+	const ulong t = a;
+	const uint q = ((t + d.y) * d.x) >> d.z;
+	return a - q * d.w;
+}
+
 __kernel void ethash_calculate_dag_item(
-    uint start, __global hash64_t const* g_light, __global hash64_t* g_dag, uint isolate)
+    uint start, __global hash64_t const* g_light, __global hash64_t* g_dag, uint isolate, uint dag_words, uint4 light_words)
 {
     uint const node_index = start + get_global_id(0);
-    if (node_index * sizeof(hash64_t) >= PROGPOW_DAG_BYTES)
-        return;
+	if (node_index >= dag_words)
+		return;
 
-    hash200_t dag_node;
-    copy(dag_node.uint4s, g_light[node_index % LIGHT_WORDS].uint4s, 4);
-    dag_node.words[0] ^= node_index;
-    SHA3_512(dag_node.uint2s, isolate);
+	hash200_t dag_node;
+	copy(dag_node.uint4s, g_light[fast_mod(node_index, light_words)].uint4s, 4);
+	dag_node.words[0] ^= node_index;
+	SHA3_512(dag_node.uint2s, isolate);
 
-    for (uint i = 0; i != ETHASH_DATASET_PARENTS; ++i)
-    {
-        uint parent_index = fnv(node_index ^ i, dag_node.words[i % NODE_WORDS]) % LIGHT_WORDS;
+	for (uint i = 0; i != ETHASH_DATASET_PARENTS; ++i)
+	{
+		uint parent_index = fast_mod(fnv(node_index ^ i, dag_node.words[i % NODE_WORDS]), light_words);
 
-        for (uint w = 0; w != 4; ++w)
-        {
-            dag_node.uint4s[w] = fnv4(dag_node.uint4s[w], g_light[parent_index].uint4s[w]);
-        }
-    }
-    SHA3_512(dag_node.uint2s, isolate);
-    copy(g_dag[node_index].uint4s, dag_node.uint4s, 4);
+		for (uint w = 0; w != 4; ++w)
+			dag_node.uint4s[w] = fnv4(dag_node.uint4s[w], g_light[parent_index].uint4s[w]);
+	}
+
+	SHA3_512(dag_node.uint2s, isolate);
+	copy(g_dag[node_index].uint4s, dag_node.uint4s, 4);
 }
diff --git a/libethash-cuda/CUDAMiner.cpp b/libethash-cuda/CUDAMiner.cpp
@@ -363,8 +363,7 @@ void CUDAMiner::compileKernel(uint64_t period_seed, uint64_t dag_elms, CUfunctio
 
     const char* name = "progpow_search";
 
-    std::string text = ProgPow::getKern(period_seed, ProgPow::KERNEL_CUDA);
-    text += std::string(CUDAMiner_kernel);
+    std::string text = ProgPow::getKern(CUDAMiner_kernel, period_seed, ProgPow::KERNEL_CUDA);
 
     std::string tmpDir;
 #ifdef _WIN32

diff --git a/libethash-cuda/CUDAMiner_kernel.cu b/libethash-cuda/CUDAMiner_kernel.cu
@@ -1,3 +1,5 @@
+PROGPOW_REPLACE_HEADER
+
 #ifndef MAX_SEARCH_RESULTS
 #define MAX_SEARCH_RESULTS 4U
 #endif
@@ -181,9 +183,10 @@ progpow_search(
         // initialize mix for all lanes
         fill_mix(hash_seed, lane_id, mix);
 
-        #pragma unroll 1
-        for (uint32_t l = 0; l < PROGPOW_CNT_DAG; l++)
-            progPowLoop(l, mix, g_dag, c_dag, hack_false);
+        #pragma unroll 2
+        for (uint32_t loop = 0; loop < PROGPOW_CNT_DAG; loop++) {
+            PROGPOW_REPLACE_MATH
+		}
 
 
         // Reduce mix data to a per-lane 32-bit digest

diff --git a/libprogpow/ProgPow.cpp b/libprogpow/ProgPow.cpp
@@ -1,6 +1,7 @@
 #include "ProgPow.h"
 
 #include <sstream>
+#include <regex>
 
 #define rnd() (kiss99(rnd_state))
 #define mix_src()   ("mix[" + std::to_string(rnd() % PROGPOW_REGS) + "]")
@@ -14,7 +15,7 @@ inline void swap(uint32_t& a, uint32_t& b)
     b = t;
 }
 
-std::string ProgPow::getKern(uint64_t prog_seed, kernel_t kern)
+std::string ProgPow::getKern(std::string kernel_code, uint64_t prog_seed, kernel_t kern)
 {
     std::stringstream ret;
 
@@ -99,45 +100,16 @@ std::string ProgPow::getKern(uint64_t prog_seed, kernel_t kern)
     {
         ret << "typedef struct __align__(16) {uint32_t s[PROGPOW_DAG_LOADS];} dag_t;\n";
         ret << "\n";
-        ret << "// Inner loop for prog_seed " << prog_seed << "\n";
-        ret << "__device__ __forceinline__ void progPowLoop(const uint32_t loop,\n";
-        ret << "        uint32_t mix[PROGPOW_REGS],\n";
-        ret << "        const dag_t *g_dag,\n";
-        ret << "        const uint32_t c_dag[PROGPOW_CACHE_WORDS],\n";
-        ret << "        const bool hack_false)\n";
     }
     else
     {
         ret << "typedef struct __attribute__ ((aligned (16))) {uint32_t s[PROGPOW_DAG_LOADS];} dag_t;\n";
         ret << "\n";
-        ret << "// Inner loop for prog_seed " << prog_seed << "\n";
-        ret << "inline void progPowLoop(const uint32_t loop,\n";
-        ret << "        volatile uint32_t mix_arg[PROGPOW_REGS],\n";
-        ret << "        __global const dag_t *g_dag,\n";
-        ret << "        __local const uint32_t c_dag[PROGPOW_CACHE_WORDS],\n";
-        ret << "        __local uint64_t share[GROUP_SHARE],\n";
-        ret << "        const bool hack_false)\n";
     }
-    ret << "{\n";
+    std::string kernel = std::regex_replace(kernel_code, std::regex("PROGPOW_REPLACE_HEADER"), ret.str());
+    ret.str(std::string());
 
-    ret << "dag_t data_dag;\n";
     ret << "uint32_t offset, data;\n";
-    // Work around AMD OpenCL compiler bug
-    // See https://github.com/gangnamtestnet/ethcoreminer/issues/16
-    if (kern == KERNEL_CL)
-    {
-        ret << "uint32_t mix[PROGPOW_REGS];\n";
-        ret << "for(uint32_t i=0; i<PROGPOW_REGS; i++)\n";
-        ret << "    mix[i] = mix_arg[i];\n";
-    }
-
-    if (kern == KERNEL_CUDA)
-        ret << "const uint32_t lane_id = threadIdx.x & (PROGPOW_LANES-1);\n";
-    else
-    {
-        ret << "const uint32_t lane_id = get_local_id(0) & (PROGPOW_LANES-1);\n";
-        ret << "const uint32_t group_id = get_local_id(0) / PROGPOW_LANES;\n";
-    }
 
     // Global memory access
     // lanes access sequential locations
@@ -149,13 +121,14 @@ std::string ProgPow::getKern(uint64_t prog_seed, kernel_t kern)
     else
     {
         ret << "if(lane_id == (loop % PROGPOW_LANES))\n";
-        ret << "    share[group_id] = mix[0];\n";
+        ret << "    share[0].uint32s[group_id] = mix[0];\n";
         ret << "barrier(CLK_LOCAL_MEM_FENCE);\n";
-        ret << "offset = share[group_id];\n";
+        ret << "offset = share[0].uint32s[group_id];\n";
     }
     ret << "offset %= PROGPOW_DAG_ELEMENTS;\n";
     ret << "offset = offset * PROGPOW_LANES + (lane_id ^ loop) % PROGPOW_LANES;\n";
-    ret << "data_dag = g_dag[offset];\n";
+    ret << "dag_t data_dag = g_dag[offset];\n";
+
     ret << "// hack to prevent compiler from reordering LD and usage\n";
     if (kern == KERNEL_CUDA)
         ret << "if (hack_false) __threadfence_block();\n";
@@ -208,16 +181,10 @@ std::string ProgPow::getKern(uint64_t prog_seed, kernel_t kern)
         uint32_t    r = rnd();
         ret << merge(dest, "data_dag.s["+std::to_string(i)+"]", r);
     }
-    // Work around AMD OpenCL compiler bug
-    if (kern == KERNEL_CL)
-    {
-        ret << "for(uint32_t i=0; i<PROGPOW_REGS; i++)\n";
-        ret << "    mix_arg[i] = mix[i];\n";
-    }
-    ret << "}\n";
     ret << "\n";
 
-    return ret.str();
+    kernel = std::regex_replace(kernel, std::regex("PROGPOW_REPLACE_MATH"), ret.str());
+    return kernel;
 }
 
 // Merge new data from b into the value in a
@@ -291,3 +258,28 @@ uint32_t ProgPow::kiss99(kiss99_t &st)
     st.jcong = 69069 * st.jcong + 1234567;
     return ((MWC^st.jcong) + st.jsr);
 }
+
+void ProgPow::calculate_fast_mod_data(uint32_t divisor, uint32_t& reciprocal, uint32_t& increment, uint32_t& shift)
+{
+    if ((divisor & (divisor - 1)) == 0) {
+        reciprocal = 1;
+        increment = 0;
+        shift = 31U - clz(divisor);
+    }
+    else {
+        shift = 63U - clz(divisor);
+        const uint64_t N = 1ULL << shift;
+        const uint64_t q = N / divisor;
+        const uint64_t r = N - q * divisor;
+        if (r * 2 < divisor)
+        {
+            reciprocal = static_cast<uint32_t>(q);
+            increment = 1;
+        }
+        else
+        {
+            reciprocal = static_cast<uint32_t>(q + 1);
+            increment = 0;
+        }
+    }
+}
diff --git a/libprogpow/ProgPow.h b/libprogpow/ProgPow.h
@@ -29,8 +29,9 @@ class ProgPow
         KERNEL_CL
     } kernel_t;
 
-    static std::string getKern(uint64_t seed, kernel_t kern);
 
+	static std::string getKern(std::string kernel_code, uint64_t seed, kernel_t kern);
+    static void calculate_fast_mod_data(uint32_t divisor, uint32_t& reciprocal, uint32_t& increment, uint32_t& shift);
 private:
     static std::string math(std::string d, std::string a, std::string b, uint32_t r);
     static std::string merge(std::string a, std::string b, uint32_t r);
@@ -43,4 +44,15 @@ class ProgPow
         uint32_t z, w, jsr, jcong;
     } kiss99_t;
     static uint32_t kiss99(kiss99_t &st);
+
+    static uint32_t clz(uint32_t a)
+    {
+#ifdef _MSC_VER
+        unsigned long index;
+        _BitScanReverse(&index, a);
+        return 31 - index;
+#else
+        return __builtin_clz(a);
+#endif
+    }
 };