From be3088883d49d3b7764503f4de334a91f9555326 Mon Sep 17 00:00:00 2001 From: Ioannis Magkanaris Date: Mon, 9 May 2022 14:00:15 +0200 Subject: [PATCH] [LLVM][GPU] Added CUDADriver to execute benchmark on GPU (#829) - Added CUDADriver to compile LLVM IR string generated from CodegenLLVMVisitor to PTX string and then execute it using CUDA API - Ability to select the compilation GPU architecture and then set the proper GPU architecture based on the GPU that is going to be used - Link `libdevice` math library with GPU LLVM module - Handles kernel and wrapper functions attributes properly for GPU execution (wrapper function is `kernel` and kernel attribute is `device`) - Small fixes in InstanceStruct declaration and setup to allocate the pointer variables properly, including the shadow variables - Adds tests in the CI that run small benchmarks in CPU and GPU on BB5 - Adds replacement of `log` math function for SLEEF and libdevice, `pow` and `fabs` for libdevice - Adds GPU execution ability in PyJIT - Small improvement in PyJIT benchmark python script to handle arguments and GPU execution - Separated benchmark info from benchmark driver - Added hh and expsyn mod files in benchmarking tests --- .gitlab-ci.yml | 37 ++-- CMakeLists.txt | 2 + INSTALL.md | 25 ++- src/codegen/codegen_driver.hpp | 3 - .../llvm/codegen_llvm_helper_visitor.cpp | 18 +- src/codegen/llvm/codegen_llvm_visitor.cpp | 78 +++++-- src/codegen/llvm/codegen_llvm_visitor.hpp | 12 +- src/codegen/llvm/llvm_utils.cpp | 35 ++- src/codegen/llvm/llvm_utils.hpp | 9 +- .../llvm/replace_with_lib_functions.cpp | 11 +- src/main.cpp | 51 +++-- src/pybind/CMakeLists.txt | 3 +- src/pybind/pynmodl.cpp | 36 +++- test/benchmark/CMakeLists.txt | 30 ++- test/benchmark/benchmark.py | 26 ++- test/benchmark/benchmark_info.hpp | 29 +++ test/benchmark/cuda_driver.cpp | 201 ++++++++++++++++++ test/benchmark/cuda_driver.hpp | 187 ++++++++++++++++ test/benchmark/gpu_parameters.hpp | 27 +++ test/benchmark/jit_driver.hpp | 20 +- test/benchmark/kernels/expsyn.mod | 42 ++++ test/benchmark/kernels/hh.mod | 125 +++++++++++ test/benchmark/llvm_benchmark.cpp | 55 ++++- test/benchmark/llvm_benchmark.hpp | 49 ++++- test/integration/mod/test_math.mod | 16 ++ test/unit/CMakeLists.txt | 1 - .../codegen/codegen_llvm_instance_struct.cpp | 18 +- test/unit/codegen/codegen_llvm_ir.cpp | 12 +- test/unit/codegen/codegen_llvm_visitor.cpp | 12 +- 29 files changed, 1045 insertions(+), 125 deletions(-) create mode 100644 test/benchmark/benchmark_info.hpp create mode 100644 test/benchmark/cuda_driver.cpp create mode 100644 test/benchmark/cuda_driver.hpp create mode 100644 test/benchmark/gpu_parameters.hpp create mode 100644 test/benchmark/kernels/expsyn.mod create mode 100644 test/benchmark/kernels/hh.mod create mode 100644 test/integration/mod/test_math.mod diff --git a/.gitlab-ci.yml b/.gitlab-ci.yml index 438e74ebfa..ffd224b758 100644 --- a/.gitlab-ci.yml +++ b/.gitlab-ci.yml @@ -27,7 +27,6 @@ trigger cvf: variables: SPACK_PACKAGE: nmodl SPACK_PACKAGE_SPEC: ~legacy-unit+python+llvm - SPACK_EXTRA_MODULES: llvm SPACK_INSTALL_EXTRA_FLAGS: -v spack_setup: @@ -45,14 +44,6 @@ build:intel: variables: SPACK_PACKAGE_COMPILER: intel -build:gcc: - extends: - - .spack_build - - .spack_nmodl - variables: - SPACK_PACKAGE_COMPILER: gcc - SPACK_PACKAGE_DEPENDENCIES: ^bison%gcc^flex%gcc^py-jinja2%gcc^py-sympy%gcc^py-pyyaml%gcc - .nmodl_tests: variables: # https://github.com/BlueBrain/nmodl/issues/737 @@ -64,8 +55,30 @@ test:intel: - .nmodl_tests needs: ["build:intel"] -test:gcc: +.benchmark_config: 
+ variables: + bb5_ntasks: 1 + bb5_cpus_per_task: 1 + bb5_memory: 16G + bb5_exclusive: full + bb5_constraint: gpu_32g # CascadeLake CPU & V100 GPU node + +.build_allocation: + variables: + bb5_ntasks: 2 # so we block 16 cores + bb5_cpus_per_task: 8 # ninja -j {this} + bb5_memory: 76G # ~16*384/80 + +build_cuda:gcc: + extends: [.spack_build, .build_allocation] + variables: + SPACK_PACKAGE: nmodl + SPACK_PACKAGE_SPEC: ~legacy-unit+python+llvm+llvm_cuda + SPACK_INSTALL_EXTRA_FLAGS: -v + SPACK_PACKAGE_COMPILER: gcc + +test_benchmark:gcc: extends: + - .benchmark_config - .ctest - - .nmodl_tests - needs: ["build:gcc"] + needs: ["build_cuda:gcc"]
diff --git a/CMakeLists.txt b/CMakeLists.txt index 9ff2061826..24695f13e1 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -24,6 +24,7 @@ option(NMODL_ENABLE_PYTHON_BINDINGS "Enable pybind11 based python bindings" ON) option(NMODL_ENABLE_LEGACY_UNITS "Use original faraday, R, etc. instead of 2019 nist constants" OFF) option(NMODL_ENABLE_LLVM "Enable LLVM based code generation" ON) option(NMODL_ENABLE_LLVM_GPU "Enable LLVM based GPU code generation" ON) +option(NMODL_ENABLE_LLVM_CUDA "Enable LLVM CUDA backend to run GPU benchmark" OFF) option(NMODL_ENABLE_JIT_EVENT_LISTENERS "Enable JITEventListener for Perf and Vtune" OFF) if(NMODL_ENABLE_LEGACY_UNITS) @@ -162,6 +163,7 @@ if(NMODL_ENABLE_LLVM) if(NMODL_ENABLE_LLVM_CUDA) enable_language(CUDA) find_package(CUDAToolkit) + include_directories(${CUDAToolkit_INCLUDE_DIRS}) add_definitions(-DNMODL_LLVM_CUDA_BACKEND) endif() endif()
diff --git a/INSTALL.md b/INSTALL.md index 1b65c1212c..7ddb21b15c 100644 --- a/INSTALL.md +++ b/INSTALL.md @@ -21,7 +21,7 @@ To build the project from source, a modern C++ compiler with C++14 support is ne - flex (>=2.6) - bison (>=3.0) -- CMake (>=3.15) +- CMake (>=3.17) - Python (>=3.6) - Python packages : jinja2 (>=2.10), pyyaml (>=3.13), pytest (>=4.0.0), sympy (>=1.3), textwrap @@ -141,6 +141,29 @@ export NMODL_WRAPLIB=/opt/nmodl/lib/libpywrapper.so **Note**: In order for all unit tests to function correctly when building without linking against libpython we must set `NMODL_PYLIB` before running cmake! +### Using CUDA backend to run benchmarks + +`NMODL` supports generating code and compiling it for execution on an `NVIDIA` GPU via its benchmark infrastructure using the `LLVM` backend. To enable the `CUDA` backend to compile and execute the GPU code we need to set the following `CMake` flag during compilation of `NMODL`: +``` +-DNMODL_ENABLE_LLVM_CUDA=ON +``` + +To find the needed `CUDA` libraries (`cudart` and `nvrtc`) you need to have the CUDA Toolkit installed on your system. +This can be done by installing the CUDA Toolkit from the [CUDA Toolkit website](https://developer.nvidia.com/cuda-downloads) or by installing the `CUDA` spack package and loading the corresponding module. + +Then, given a supported MOD file, you can execute the benchmark on your supported NVIDIA GPU by running the following command: +``` +./bin/nmodl .mod llvm --no-debug --ir --opt-level-ir 3 gpu --target-arch "sm_80" --name "nvptx64" --math-library libdevice benchmark --run --libs "${CUDA_ROOT}/nvvm/libdevice/libdevice.10.bc" --opt-level-codegen 3 --instance-size 10000000 --repeat 2 --grid-dim-x 4096 --block-dim-x 256 +``` +The above command executes the benchmark on a GPU with `Compute Architecture` `sm_80` and links the generated code to the `libdevice` optimized math library provided by `NVIDIA`. 
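+For example, if the CUDA Toolkit is installed under `/usr/local/cuda` (an illustrative location; adjust it to your installation), the `CUDA_ROOT` variable used in the command above can be set beforehand as:
+```
+export CUDA_ROOT=/usr/local/cuda
+```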
+Using the above `nmodl` command you can also select the optimization level of the generated code, the instance size of the generated data, the number of repetitions, and the grid and block dimensions for the GPU execution. + +**Note**: For the CUDA backend to be able to compile and execute the generated code on the GPU, the installed CUDA Toolkit version needs to match the `CUDA` version provided by the NVIDIA driver on the system that will be used to run the benchmark. +You can find that CUDA version by running the following command: +``` +nvidia-smi +``` +and noting the `CUDA Version` stated there. For example, if the `CUDA Version` reported by `nvidia-smi` is CUDA 11.4, you need to install `CUDA Toolkit 11.4.*` to be able to compile and execute the GPU code. ## Testing the Installed Module
diff --git a/src/codegen/codegen_driver.hpp b/src/codegen/codegen_driver.hpp index 78c95421da..14d8ed76ab 100644 --- a/src/codegen/codegen_driver.hpp +++ b/src/codegen/codegen_driver.hpp @@ -33,9 +33,6 @@ struct CodeGenConfig { /// true if cuda code to be generated bool cuda_backend = false; - /// true if llvm code to be generated - bool llvm_backend = false; - /// true if sympy should be used for solving ODEs analytically bool sympy_analytic = false;
diff --git a/src/codegen/llvm/codegen_llvm_helper_visitor.cpp b/src/codegen/llvm/codegen_llvm_helper_visitor.cpp index 5f8119a4d1..5800beae6b 100644 --- a/src/codegen/llvm/codegen_llvm_helper_visitor.cpp +++ b/src/codegen/llvm/codegen_llvm_helper_visitor.cpp @@ -239,13 +239,6 @@ std::shared_ptr CodegenLLVMHelperVisitor::create_instance_s add_var_with_type(VOLTAGE_VAR, FLOAT_TYPE, /*is_pointer=*/1); add_var_with_type(NODE_INDEX_VAR, INTEGER_TYPE, /*is_pointer=*/1); - // add dt, t, celsius - add_var_with_type(naming::NTHREAD_T_VARIABLE, FLOAT_TYPE, /*is_pointer=*/0); - add_var_with_type(naming::NTHREAD_DT_VARIABLE, FLOAT_TYPE, /*is_pointer=*/0); - add_var_with_type(naming::CELSIUS_VARIABLE, FLOAT_TYPE, /*is_pointer=*/0); - add_var_with_type(naming::SECOND_ORDER_VARIABLE, INTEGER_TYPE, /*is_pointer=*/0); - add_var_with_type(naming::MECH_NODECOUNT_VAR, INTEGER_TYPE, /*is_pointer=*/0); - // As we do not have `NrnThread` object as an argument, we store points to rhs // and d to in the instance struct as well. Also need their respective shadow variables // in case of point process mechanism. 
@@ -256,6 +249,17 @@ std::shared_ptr CodegenLLVMHelperVisitor::create_instance_s add_var_with_type(naming::NTHREAD_RHS_SHADOW, FLOAT_TYPE, /*is_pointer=*/1); add_var_with_type(naming::NTHREAD_D_SHADOW, FLOAT_TYPE, /*is_pointer=*/1); + // NOTE: All the pointer variables should be declared before the scalar variables otherwise + // the allocation of memory for the variables in the InstanceStruct and their offsets will be + // wrong + + // add dt, t, celsius + add_var_with_type(naming::NTHREAD_T_VARIABLE, FLOAT_TYPE, /*is_pointer=*/0); + add_var_with_type(naming::NTHREAD_DT_VARIABLE, FLOAT_TYPE, /*is_pointer=*/0); + add_var_with_type(naming::CELSIUS_VARIABLE, FLOAT_TYPE, /*is_pointer=*/0); + add_var_with_type(naming::SECOND_ORDER_VARIABLE, INTEGER_TYPE, /*is_pointer=*/0); + add_var_with_type(naming::MECH_NODECOUNT_VAR, INTEGER_TYPE, /*is_pointer=*/0); + return std::make_shared(codegen_vars); } diff --git a/src/codegen/llvm/codegen_llvm_visitor.cpp b/src/codegen/llvm/codegen_llvm_visitor.cpp index 9e159f7aff..de6c7ad914 100644 --- a/src/codegen/llvm/codegen_llvm_visitor.cpp +++ b/src/codegen/llvm/codegen_llvm_visitor.cpp @@ -25,6 +25,10 @@ namespace codegen { /* Helper routines */ /****************************************************************************************/ +static std::string get_wrapper_name(const std::string& kernel_name) { + return "__" + kernel_name + "_wrapper"; +} + /// A utility to check for supported Statement AST nodes. static bool is_supported_statement(const ast::Statement& statement) { return statement.is_codegen_atomic_statement() || statement.is_codegen_for_statement() || @@ -55,15 +59,36 @@ static bool can_vectorize(const ast::CodegenForStatement& statement, symtab::Sym return unsupported.empty() && supported.size() <= 1; } -void CodegenLLVMVisitor::annotate_kernel_with_nvvm(llvm::Function* kernel) { +void CodegenLLVMVisitor::annotate_kernel_with_nvvm(llvm::Function* kernel, + const std::string& annotation = "kernel") { llvm::Metadata* metadata[] = {llvm::ValueAsMetadata::get(kernel), - llvm::MDString::get(*context, "kernel"), + llvm::MDString::get(*context, annotation), llvm::ValueAsMetadata::get( llvm::ConstantInt::get(llvm::Type::getInt32Ty(*context), 1))}; llvm::MDNode* node = llvm::MDNode::get(*context, metadata); module->getOrInsertNamedMetadata("nvvm.annotations")->addOperand(node); } +void CodegenLLVMVisitor::annotate_wrapper_kernels_with_nvvm() { + // First clear all the nvvm annotations from the module + auto module_named_metadata = module->getNamedMetadata("nvvm.annotations"); + module->eraseNamedMetadata(module_named_metadata); + + // Then each kernel should be annotated as "device" function and wrappers should be annotated as + // "kernel" functions + std::vector kernel_names; + find_kernel_names(kernel_names); + + for (const auto& kernel_name: kernel_names) { + // Get the kernel function. + auto kernel = module->getFunction(kernel_name); + // Get the kernel wrapper function. + auto kernel_wrapper = module->getFunction(get_wrapper_name(kernel_name)); + annotate_kernel_with_nvvm(kernel, "device"); + annotate_kernel_with_nvvm(kernel_wrapper, "kernel"); + } +} + llvm::Value* CodegenLLVMVisitor::accept_and_get(const std::shared_ptr& node) { node->accept(*this); return ir_builder.pop_last_value(); @@ -402,12 +427,17 @@ void CodegenLLVMVisitor::wrap_kernel_functions() { auto kernel = module->getFunction(kernel_name); // Create a wrapper void function that takes a void pointer as a single argument. 
- llvm::Type* i32_type = ir_builder.get_i32_type(); + llvm::Type* return_type; + if (platform.is_gpu()) { + return_type = ir_builder.get_void_type(); + } else { + return_type = ir_builder.get_i32_type(); + } llvm::Type* void_ptr_type = ir_builder.get_i8_ptr_type(); llvm::Function* wrapper_func = llvm::Function::Create( - llvm::FunctionType::get(i32_type, {void_ptr_type}, /*isVarArg=*/false), + llvm::FunctionType::get(return_type, {void_ptr_type}, /*isVarArg=*/false), llvm::Function::ExternalLinkage, - "__" + kernel_name + "_wrapper", + get_wrapper_name(kernel_name), *module); // Optionally, add debug information for the wrapper function. @@ -425,9 +455,23 @@ void CodegenLLVMVisitor::wrap_kernel_functions() { args.push_back(bitcasted); ir_builder.create_function_call(kernel, args, /*use_result=*/false); - // Create a 0 return value and a return instruction. - ir_builder.create_i32_constant(0); - ir_builder.create_return(ir_builder.pop_last_value()); + // create return instructions and annotate wrapper with certain attributes depending on + // the backend type + if (platform.is_gpu()) { + // return void + ir_builder.create_return(); + } else { + // Create a 0 return value and a return instruction. + ir_builder.create_i32_constant(0); + ir_builder.create_return(ir_builder.pop_last_value()); + ir_builder.set_function(wrapper_func); + ir_builder.set_kernel_attributes(); + } + ir_builder.clear_function(); + } + // for GPU we need to first clear all the annotations and then reapply them + if (platform.is_gpu()) { + annotate_wrapper_kernels_with_nvvm(); } } @@ -823,9 +867,6 @@ void CodegenLLVMVisitor::visit_program(const ast::Program& node) { // Handle GPU optimizations (CUDA platfroms only for now). if (platform.is_gpu()) { - if (!platform.is_CUDA_gpu()) - throw std::runtime_error("Error: unsupported GPU architecture!\n"); - // We only support CUDA backends anyway, so this works for now. utils::initialise_nvptx_passes(); @@ -839,15 +880,12 @@ void CodegenLLVMVisitor::visit_program(const ast::Program& node) { logger->debug("Dumping generated IR...\n" + dump_module()); } - // If the output directory is specified, save the IR to .ll file. 
- if (output_dir != ".") { - utils::save_ir_to_ll_file(*module, output_dir + "/" + mod_filename); - } - // Setup CodegenHelper for C++ wrapper file setup(node); + // Print C++ wrapper file print_wrapper_routines(); - print_target_file(); + // Print LLVM IR module to .ll file + utils::save_ir_to_ll_file(*module, output_dir + "/" + mod_filename); } void CodegenLLVMVisitor::print_mechanism_range_var_structure() { @@ -960,6 +998,12 @@ void CodegenLLVMVisitor::print_instance_variable_setup() { // Pass ml->nodeindices pointer to node_index printer->add_line("inst->node_index = ml->nodeindices;"); + // Setup rhs, d and their shadow vectors + printer->add_line(fmt::format("inst->{} = nt->_actual_rhs;", naming::NTHREAD_RHS)); + printer->add_line(fmt::format("inst->{} = nt->_actual_d;", naming::NTHREAD_D)); + printer->add_line(fmt::format("inst->{} = nt->_shadow_rhs;", naming::NTHREAD_RHS_SHADOW)); + printer->add_line(fmt::format("inst->{} = nt->_shadow_d;", naming::NTHREAD_D_SHADOW)); + // Setup global variables printer->add_line("inst->{0} = nt->{0};"_format(naming::NTHREAD_T_VARIABLE)); printer->add_line("inst->{0} = nt->{0};"_format(naming::NTHREAD_DT_VARIABLE)); diff --git a/src/codegen/llvm/codegen_llvm_visitor.hpp b/src/codegen/llvm/codegen_llvm_visitor.hpp index 0862307337..a22f698431 100644 --- a/src/codegen/llvm/codegen_llvm_visitor.hpp +++ b/src/codegen/llvm/codegen_llvm_visitor.hpp @@ -139,10 +139,6 @@ class CodegenLLVMVisitor: public CodegenCVisitor { return str; } - void print_target_file() const { - target_printer->add_multi_line(dump_module()); - } - /// Fills the container with the names of kernel functions from the MOD file. void find_kernel_names(std::vector& container); @@ -303,8 +299,12 @@ class CodegenLLVMVisitor: public CodegenCVisitor { void print_compute_functions() override; private: - // Annotates kernel function with NVVM metadata. - void annotate_kernel_with_nvvm(llvm::Function* kernel); + /// Annotates kernel function with NVVM metadata. + void annotate_kernel_with_nvvm(llvm::Function* kernel, const std::string& annotation); + + /// Handles NVVM function annotations when we create the wrapper functions. All original kernels + /// should be "device" functions and wrappers "kernel" functions + void annotate_wrapper_kernels_with_nvvm(); /// Accepts the given AST node and returns the processed value. llvm::Value* accept_and_get(const std::shared_ptr& node); diff --git a/src/codegen/llvm/llvm_utils.cpp b/src/codegen/llvm/llvm_utils.cpp index bd4feee32f..f6590fec5b 100644 --- a/src/codegen/llvm/llvm_utils.cpp +++ b/src/codegen/llvm/llvm_utils.cpp @@ -75,12 +75,9 @@ void initialise_nvptx_passes() { initialise_optimisation_passes(); } -void optimise_module_for_nvptx(codegen::Platform& platform, - llvm::Module& module, - int opt_level, - std::string& target_asm) { +std::unique_ptr create_CUDA_target_machine(const codegen::Platform& platform, + llvm::Module& module) { // CUDA target machine we generating code for. - std::unique_ptr tm; std::string platform_name = platform.get_name(); // Target and layout information. @@ -111,9 +108,30 @@ void optimise_module_for_nvptx(codegen::Platform& platform, if (!target) throw std::runtime_error("Error: " + error_msg + "\n"); + std::unique_ptr tm; tm.reset(target->createTargetMachine(triple, subtarget, features, {}, {})); if (!tm) throw std::runtime_error("Error: creating target machine failed! 
Aborting."); + return tm; +} + +std::string get_module_ptx(llvm::TargetMachine& tm, llvm::Module& module) { + std::string target_asm; + llvm::raw_string_ostream stream(target_asm); + llvm::buffer_ostream pstream(stream); + llvm::legacy::PassManager codegen_pm; + + tm.addPassesToEmitFile(codegen_pm, pstream, nullptr, llvm::CGFT_AssemblyFile); + codegen_pm.run(module); + return target_asm; +} + +void optimise_module_for_nvptx(const codegen::Platform& platform, + llvm::Module& module, + int opt_level, + std::string& target_asm) { + // Create target machine for CUDA GPU + auto tm = create_CUDA_target_machine(platform, module); // Create pass managers. llvm::legacy::FunctionPassManager func_pm(&module); @@ -137,12 +155,7 @@ void optimise_module_for_nvptx(codegen::Platform& platform, // Now, we want to run target-specific (e.g. NVPTX) passes. In LLVM, this // is done via `addPassesToEmitFile`. - llvm::raw_string_ostream stream(target_asm); - llvm::buffer_ostream pstream(stream); - llvm::legacy::PassManager codegen_pm; - - tm->addPassesToEmitFile(codegen_pm, pstream, nullptr, llvm::CGFT_AssemblyFile); - codegen_pm.run(module); + target_asm = get_module_ptx(*tm, module); } void initialise_optimisation_passes() { diff --git a/src/codegen/llvm/llvm_utils.hpp b/src/codegen/llvm/llvm_utils.hpp index 3394463317..9763718ab0 100644 --- a/src/codegen/llvm/llvm_utils.hpp +++ b/src/codegen/llvm/llvm_utils.hpp @@ -21,11 +21,18 @@ void initialise_optimisation_passes(); /// Initialises NVPTX-specific optimisation passes. void initialise_nvptx_passes(); +//// Initializes a CUDA target machine +std::unique_ptr create_CUDA_target_machine(const codegen::Platform& platform, + llvm::Module& module); + +/// Generate PTX code given a CUDA target machine and the module +std::string get_module_ptx(llvm::TargetMachine& tm, llvm::Module& module); + /// Replaces calls to LLVM intrinsics with appropriate library calls. void replace_with_lib_functions(codegen::Platform& platform, llvm::Module& module); /// Optimises the given LLVM IR module for NVPTX targets. 
-void optimise_module_for_nvptx(codegen::Platform& platform, +void optimise_module_for_nvptx(const codegen::Platform& platform, llvm::Module& module, int opt_level, std::string& target_asm); diff --git a/src/codegen/llvm/replace_with_lib_functions.cpp b/src/codegen/llvm/replace_with_lib_functions.cpp index 6d98dd3eb0..750e2c2318 100644 --- a/src/codegen/llvm/replace_with_lib_functions.cpp +++ b/src/codegen/llvm/replace_with_lib_functions.cpp @@ -72,6 +72,8 @@ void ReplaceMathFunctions::add_vectorizable_functions_from_vec_lib(TargetLibrary DISPATCH("llvm.exp.f64", "_ZGVnN2v_exp", FIXED(2)) DISPATCH("llvm.pow.f32", "_ZGVnN4vv_powf", FIXED(4)) DISPATCH("llvm.pow.f64", "_ZGVnN2vv_pow", FIXED(2)) + DISPATCH("llvm.log.f32", "_ZGVnN4v_logf", FIXED(4)) + DISPATCH("llvm.log.f64", "_ZGVnN2v_log", FIXED(2)) // clang-format on }; const VecDesc x86_functions[] = { @@ -82,6 +84,9 @@ void ReplaceMathFunctions::add_vectorizable_functions_from_vec_lib(TargetLibrary DISPATCH("llvm.pow.f64", "_ZGVbN2vv_pow", FIXED(2)) DISPATCH("llvm.pow.f64", "_ZGVdN4vv_pow", FIXED(4)) DISPATCH("llvm.pow.f64", "_ZGVeN8vv_pow", FIXED(8)) + DISPATCH("llvm.log.f64", "_ZGVbN2v_log", FIXED(2)) + DISPATCH("llvm.log.f64", "_ZGVdN4v_log", FIXED(4)) + DISPATCH("llvm.log.f64", "_ZGVeN8v_log", FIXED(8)) // clang-format on }; #undef DISPATCH @@ -166,7 +171,11 @@ bool ReplaceWithLibdevice::replace_call(CallInst& call_inst) { static const std::map libdevice_name = {{"llvm.exp.f32", "__nv_expf"}, {"llvm.exp.f64", "__nv_exp"}, {"llvm.pow.f32", "__nv_powf"}, - {"llvm.pow.f64", "__nv_pow"}}; + {"llvm.pow.f64", "__nv_pow"}, + {"llvm.log.f32", "__nv_logf"}, + {"llvm.log.f64", "__nv_log"}, + {"llvm.fabs.f32", "__nv_fabsf"}, + {"llvm.fabs.f64", "__nv_fabs"}}; // If replacement is not supported, abort. std::string old_name = function->getName().str(); diff --git a/src/main.cpp b/src/main.cpp index ae41b09e50..e3e9e32929 100644 --- a/src/main.cpp +++ b/src/main.cpp @@ -65,6 +65,12 @@ int main(int argc, const char* argv[]) { /// the number of repeated experiments for the benchmarking int num_experiments = 100; + + /// X dimension of grid in blocks for GPU execution + int llvm_cuda_grid_dim_x = 1; + + /// X dimension of block in threads for GPU execution + int llvm_cuda_block_dim_x = 1; #endif CodeGenConfig cfg; @@ -229,9 +235,10 @@ int main(int argc, const char* argv[]) { auto gpu_target_name = gpu_opt->add_option("--name", cfg.llvm_gpu_name, "Name of GPU platform to use")->ignore_case(); - gpu_opt->add_option("--target-chip", + gpu_target_name->check(CLI::IsMember({"nvptx", "nvptx64"})); + gpu_opt->add_option("--target-arch", cfg.llvm_gpu_target_architecture, - "Name of target chip to use")->ignore_case(); + "Name of target architecture to use")->ignore_case(); auto gpu_math_library_opt = gpu_opt->add_option("--math-library", cfg.llvm_math_library, "Math library for GPU code generation ({})"_format(cfg.llvm_math_library)); @@ -259,6 +266,12 @@ int main(int argc, const char* argv[]) { benchmark_opt->add_option("--repeat", num_experiments, "Number of experiments for benchmarking ({})"_format(num_experiments))->ignore_case(); + benchmark_opt->add_option("--grid-dim-x", + llvm_cuda_grid_dim_x, + "Grid dimension X ({})"_format(llvm_cuda_grid_dim_x))->ignore_case(); + benchmark_opt->add_option("--block-dim-x", + llvm_cuda_block_dim_x, + "Block dimension X ({})"_format(llvm_cuda_block_dim_x))->ignore_case(); #endif // clang-format on @@ -373,11 +386,18 @@ int main(int argc, const char* argv[]) { : cfg.llvm_gpu_name; Platform platform(pid, name, - 
cfg.llvm_cpu_name, + cfg.llvm_gpu_target_architecture, cfg.llvm_math_library, cfg.llvm_float_type, cfg.llvm_vector_width); + // GPU code generation doesn't support debug information at the moment so disable it + // in case it's enabled + if (!cfg.llvm_no_debug && platform.is_gpu()) { + logger->warn("Disabling addition of debug symbols in GPU code."); + cfg.llvm_no_debug = true; + } + logger->info("Running LLVM backend code generator"); CodegenLLVMVisitor visitor(modfile, cfg.output_dir, @@ -398,23 +418,30 @@ int main(int argc, const char* argv[]) { } if (llvm_benchmark) { - // \todo integrate Platform class here - if (cfg.llvm_gpu_name != "default") { - logger->warn( - "GPU benchmarking is not supported, targeting " - "CPU instead"); - } - logger->info("Running LLVM benchmark"); + if (platform.is_gpu() && !platform.is_CUDA_gpu()) { + throw std::runtime_error( + "Benchmarking is only supported on CUDA GPUs at the moment"); + } +#ifndef NMODL_LLVM_CUDA_BACKEND + if (platform.is_CUDA_gpu()) { + throw std::runtime_error( + "GPU benchmarking is not supported if NMODL is not built with CUDA " + "backend enabled."); + } +#endif + const GPUExecutionParameters gpu_execution_parameters{llvm_cuda_grid_dim_x, + llvm_cuda_block_dim_x}; benchmark::LLVMBenchmark benchmark(visitor, modfile, cfg.output_dir, cfg.shared_lib_paths, num_experiments, instance_size, - cfg.llvm_cpu_name, + platform, cfg.llvm_opt_level_ir, - cfg.llvm_opt_level_codegen); + cfg.llvm_opt_level_codegen, + gpu_execution_parameters); benchmark.run(); } } diff --git a/src/pybind/CMakeLists.txt b/src/pybind/CMakeLists.txt index 16f4a586cc..43be3b01a1 100644 --- a/src/pybind/CMakeLists.txt +++ b/src/pybind/CMakeLists.txt @@ -73,7 +73,8 @@ if(NMODL_ENABLE_PYTHON_BINDINGS) # Additional options are needed when LLVM JIT functionality is built if(NMODL_ENABLE_LLVM) - set_property(TARGET codegen llvm_codegen llvm_benchmark benchmark_data PROPERTY POSITION_INDEPENDENT_CODE ON) + set_property(TARGET codegen llvm_codegen llvm_benchmark benchmark_data + PROPERTY POSITION_INDEPENDENT_CODE ON) target_link_libraries(_nmodl PRIVATE codegen llvm_codegen llvm_benchmark benchmark_data ${LLVM_LIBS_TO_LINK}) endif() diff --git a/src/pybind/pynmodl.cpp b/src/pybind/pynmodl.cpp index 9c0d90615f..3fd731738d 100644 --- a/src/pybind/pynmodl.cpp +++ b/src/pybind/pynmodl.cpp @@ -151,8 +151,22 @@ class JitDriver { : nmodl::codegen::PlatformID::GPU; const std::string name = cfg.llvm_gpu_name == "default" ? 
cfg.llvm_cpu_name : cfg.llvm_gpu_name; - platform = nmodl::codegen::Platform( - pid, name, cfg.llvm_math_library, cfg.llvm_float_type, cfg.llvm_vector_width); + platform = nmodl::codegen::Platform(pid, + name, + cfg.llvm_gpu_target_architecture, + cfg.llvm_math_library, + cfg.llvm_float_type, + cfg.llvm_vector_width); + if (platform.is_gpu() && !platform.is_CUDA_gpu()) { + throw std::runtime_error("Benchmarking is only supported on CUDA GPUs at the moment"); + } +#ifndef NMODL_LLVM_CUDA_BACKEND + if (platform.is_CUDA_gpu()) { + throw std::runtime_error( + "GPU benchmarking is not supported if NMODL is not built with CUDA " + "backend enabled."); + } +#endif } public: @@ -171,7 +185,9 @@ class JitDriver { benchmark::BenchmarkResults run(std::shared_ptr node, std::string& modname, int num_experiments, - int instance_size) { + int instance_size, + int cuda_grid_dim_x, + int cuda_block_dim_x) { // New directory is needed to be created otherwise the directory cannot be created // automatically through python if (cfg.nmodl_ast || cfg.json_ast || cfg.json_perfstat) { @@ -180,15 +196,17 @@ class JitDriver { cg_driver.prepare_mod(node, modname); nmodl::codegen::CodegenLLVMVisitor visitor(modname, cfg.output_dir, platform, 0); visitor.visit_program(*node); + const GPUExecutionParameters gpu_execution_parameters{cuda_grid_dim_x, cuda_block_dim_x}; nmodl::benchmark::LLVMBenchmark benchmark(visitor, modname, cfg.output_dir, cfg.shared_lib_paths, num_experiments, instance_size, - cfg.llvm_cpu_name, + platform, cfg.llvm_opt_level_ir, - cfg.llvm_opt_level_codegen); + cfg.llvm_opt_level_codegen, + gpu_execution_parameters); return benchmark.run(); } }; @@ -228,7 +246,7 @@ PYBIND11_MODULE(_nmodl, m_nmodl) { cfg.def(py::init([]() { auto cfg = std::make_unique(); // set to more sensible defaults for python binding - cfg->llvm_backend = true; + cfg->llvm_ir = true; return cfg; })) .def_readwrite("sympy_analytic", &nmodl::codegen::CodeGenConfig::sympy_analytic) @@ -265,6 +283,8 @@ PYBIND11_MODULE(_nmodl, m_nmodl) { .def_readwrite("llvm_fast_math_flags", &nmodl::codegen::CodeGenConfig::llvm_fast_math_flags) .def_readwrite("llvm_cpu_name", &nmodl::codegen::CodeGenConfig::llvm_cpu_name) .def_readwrite("llvm_gpu_name", &nmodl::codegen::CodeGenConfig::llvm_gpu_name) + .def_readwrite("llvm_gpu_target_architecture", + &nmodl::codegen::CodeGenConfig::llvm_gpu_target_architecture) .def_readwrite("llvm_vector_width", &nmodl::codegen::CodeGenConfig::llvm_vector_width) .def_readwrite("llvm_opt_level_codegen", &nmodl::codegen::CodeGenConfig::llvm_opt_level_codegen) @@ -278,7 +298,9 @@ PYBIND11_MODULE(_nmodl, m_nmodl) { "node"_a, "modname"_a, "num_experiments"_a, - "instance_size"_a); + "instance_size"_a, + "cuda_grid_dim_x"_a = 1, + "cuda_block_dim_x"_a = 1); m_nmodl.def("to_nmodl", static_cast 1. 
See https://github.com/BlueBrain/nmodl/issues/857 + if(${modfile} STREQUAL "${NMODL_PROJECT_SOURCE_DIR}/test/benchmark/kernels/expsyn.mod") + set(extra_args "--vec 1") + endif() get_filename_component(modfile_name "${modfile}" NAME) add_test(NAME "PyJIT/${modfile_name}" COMMAND ${PYTHON_EXECUTABLE} ${NMODL_PROJECT_SOURCE_DIR}/test/benchmark/benchmark.py - ${modfile}) + --file ${modfile} ${extra_args}) set_tests_properties( "PyJIT/${modfile_name}" PROPERTIES ENVIRONMENT PYTHONPATH=${PROJECT_BINARY_DIR}/lib:$ENV{PYTHONPATH}) + # Disable running the expsyn.mod on GPU because atomic instructions are not supported yet on GPU + # See https://github.com/BlueBrain/nmodl/issues/834 + if(NMODL_ENABLE_LLVM_CUDA AND NOT ${modfile} STREQUAL "${NMODL_PROJECT_SOURCE_DIR}/test/benchmark/kernels/expsyn.mod") + add_test(NAME "PyJIT/${modfile_name}_gpu" + COMMAND ${PYTHON_EXECUTABLE} ${NMODL_PROJECT_SOURCE_DIR}/test/benchmark/benchmark.py + --file ${modfile} --gpu ${extra_args}) + message(STATUS "CUDA_HOME is ${CUDAToolkit_TARGET_DIR}") + set_tests_properties( + "PyJIT/${modfile_name}_gpu" + PROPERTIES + ENVIRONMENT + "PYTHONPATH=${PROJECT_BINARY_DIR}/lib:$ENV{PYTHONPATH};CUDA_HOME=${CUDAToolkit_TARGET_DIR}" + ) + endif() endforeach() endif() diff --git a/test/benchmark/benchmark.py b/test/benchmark/benchmark.py index c133f8d59c..9144fa549d 100644 --- a/test/benchmark/benchmark.py +++ b/test/benchmark/benchmark.py @@ -1,17 +1,39 @@ +import argparse import sys +import os import nmodl.dsl as nmodl from nmodl import ast, visitor +def parse_arguments(): + parser = argparse.ArgumentParser(description='Benchmark test script for NMODL.') + parser.add_argument('--gpu', action='store_true', default=False, + help='Enable GPU JIT execution') + parser.add_argument('--vec', type=int, default=1, + help='Vector width for CPU execution') + parser.add_argument('--file', type=str, + help='NMODL file to benchmark') + args, _ = parser.parse_known_args() + return args + def main(): + args = parse_arguments() + driver = nmodl.NmodlDriver() lookup_visitor = visitor.AstLookupVisitor() cfg = nmodl.CodeGenConfig() - cfg.llvm_vector_width = 4 + cfg.llvm_vector_width = args.vec cfg.llvm_opt_level_ir = 2 cfg.nmodl_ast = True - fname = sys.argv[1] + fname = args.file + if args.gpu: # GPU enabled + cfg.llvm_math_library = "libdevice" + cfg.llvm_gpu_name = "nvptx64" + cfg.llvm_gpu_target_architecture = "sm_70" + if not os.environ.get("CUDA_HOME"): + raise RuntimeError("CUDA_HOME environment variable not set") + cfg.shared_lib_paths = [os.getenv("CUDA_HOME") + "/nvvm/libdevice/libdevice.10.bc"] with open(fname) as f: hh = f.read() modast = driver.parse_string(hh) diff --git a/test/benchmark/benchmark_info.hpp b/test/benchmark/benchmark_info.hpp new file mode 100644 index 0000000000..d02d33ce2e --- /dev/null +++ b/test/benchmark/benchmark_info.hpp @@ -0,0 +1,29 @@ +/************************************************************************* + * Copyright (C) 2018-2022 Blue Brain Project + * + * This file is part of NMODL distributed under the terms of the GNU + * Lesser General Public License. See top-level LICENSE file for details. + *************************************************************************/ + +#pragma once + +#include +#include + +/// A struct to hold the information for benchmarking. +struct BenchmarkInfo { + /// Object or PTX filename to dump. + std::string filename; + + /// Object file output directory. + std::string output_dir; + + /// Shared libraries' paths to link against. 
+ std::vector shared_lib_paths; + + /// Optimisation level for IT. + int opt_level_ir; + + /// Optimisation level for machine code generation. + int opt_level_codegen; +}; diff --git a/test/benchmark/cuda_driver.cpp b/test/benchmark/cuda_driver.cpp new file mode 100644 index 0000000000..cecc97b35d --- /dev/null +++ b/test/benchmark/cuda_driver.cpp @@ -0,0 +1,201 @@ +/************************************************************************* + * Copyright (C) 2018-2022 Blue Brain Project + * + * This file is part of NMODL distributed under the terms of the GNU + * Lesser General Public License. See top-level LICENSE file for details. + *************************************************************************/ + +#include +#include + +#include "codegen/llvm/codegen_llvm_visitor.hpp" +#include "codegen/llvm/llvm_utils.hpp" +#include "cuda_driver.hpp" +#include "fmt/format.h" +#include "utils/common_utils.hpp" + +#include "llvm/Bitcode/BitcodeReader.h" +#include "llvm/Linker/Linker.h" +#include "llvm/Support/ErrorOr.h" +#include "llvm/Support/MemoryBuffer.h" +#include "llvm/Target/TargetMachine.h" + +using fmt::literals::operator""_format; + +namespace nmodl { +namespace runner { + +void CUDADriver::checkCudaErrors(CUresult err) { + if (err != CUDA_SUCCESS) { + const char* ret = NULL; + cuGetErrorName(err, &ret); + throw std::runtime_error("CUDA error: " + std::string(ret)); + } +} + +void CUDADriver::link_libraries(llvm::Module& module, BenchmarkInfo* benchmark_info) { + llvm::Linker linker(module); + for (const auto& lib_path: benchmark_info->shared_lib_paths) { + const auto lib_name = lib_path.substr(lib_path.find_last_of("/\\") + 1); + std::regex libdevice_bitcode_name{"libdevice.*.bc"}; + if (!std::regex_match(lib_name, libdevice_bitcode_name)) { + throw std::runtime_error("Only libdevice is supported for now"); + } + // Load libdevice module to the LLVM Module + auto libdevice_file_memory_buffer = llvm::MemoryBuffer::getFile(lib_path); + llvm::Expected> libdevice_expected_module = + parseBitcodeFile(libdevice_file_memory_buffer->get()->getMemBufferRef(), + module.getContext()); + if (std::error_code error = errorToErrorCode(libdevice_expected_module.takeError())) { + throw std::runtime_error("Error reading bitcode: {}"_format(error.message())); + } + linker.linkInModule(std::move(libdevice_expected_module.get()), + llvm::Linker::LinkOnlyNeeded); + } +} + +void print_string_to_file(const std::string& ptx_compiled_module, const std::string& filename) { + std::ofstream ptx_file(filename); + ptx_file << ptx_compiled_module; + ptx_file.close(); +} + +// Converts the CUDA compute version to the CUjit_target enum used by the CUJIT +CUjit_target get_CUjit_target(const int compute_version_major, const int compute_version_minor) { + auto compute_architecture = compute_version_major * 10 + compute_version_minor; + switch (compute_architecture) { + case 20: + return CU_TARGET_COMPUTE_20; + case 21: + return CU_TARGET_COMPUTE_21; + case 30: + return CU_TARGET_COMPUTE_30; + case 32: + return CU_TARGET_COMPUTE_32; + case 35: + return CU_TARGET_COMPUTE_35; + case 37: + return CU_TARGET_COMPUTE_37; + case 50: + return CU_TARGET_COMPUTE_50; + case 52: + return CU_TARGET_COMPUTE_52; + case 53: + return CU_TARGET_COMPUTE_53; + case 60: + return CU_TARGET_COMPUTE_60; + case 61: + return CU_TARGET_COMPUTE_61; + case 62: + return CU_TARGET_COMPUTE_62; + case 70: + return CU_TARGET_COMPUTE_70; + case 72: + return CU_TARGET_COMPUTE_72; + case 75: + return CU_TARGET_COMPUTE_75; + case 80: + return 
CU_TARGET_COMPUTE_80; + case 86: + return CU_TARGET_COMPUTE_86; + default: + throw std::runtime_error("Unsupported compute architecture"); + } +} + +void CUDADriver::init(const codegen::Platform& platform, BenchmarkInfo* benchmark_info) { + // CUDA initialization + checkCudaErrors(cuInit(0)); + checkCudaErrors(cuDeviceGetCount(&device_info.count)); + checkCudaErrors(cuDeviceGet(&device, 0)); + + char name[128]; + checkCudaErrors(cuDeviceGetName(name, 128, device)); + device_info.name = name; + logger->info("Using CUDA Device [0]: {}"_format(device_info.name)); + + // Get the compute capability of the device that is actually going to be used to run the kernel + checkCudaErrors(cuDeviceGetAttribute(&device_info.compute_version_major, + CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MAJOR, + device)); + checkCudaErrors(cuDeviceGetAttribute(&device_info.compute_version_minor, + CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MINOR, + device)); + logger->info("Device Compute Capability: {}.{}"_format(device_info.compute_version_major, + device_info.compute_version_minor)); + if (device_info.compute_version_major < 2) { + throw std::runtime_error("ERROR: Device 0 is not SM 2.0 or greater"); + } + + // Load the external libraries modules to the NVVM program + // Currently only libdevice is supported + link_libraries(*module, benchmark_info); + + // Compile the program + logger->info("Compiling the LLVM IR to PTX"); + + // Optimize code for nvptx including the wrapper functions and generate PTX + const auto opt_level_codegen = benchmark_info ? benchmark_info->opt_level_codegen : 0; + utils::optimise_module_for_nvptx(platform, *module, opt_level_codegen, ptx_compiled_module); + utils::save_ir_to_ll_file(*module, + benchmark_info->output_dir + "/" + benchmark_info->filename + + "_benchmark"); + if (benchmark_info) { + print_string_to_file(ptx_compiled_module, + benchmark_info->output_dir + "/" + benchmark_info->filename + ".ptx"); + } + + // Create driver context + checkCudaErrors(cuCtxCreate(&context, 0, device)); + + // Create module for object + logger->info("Loading PTX to CUDA module"); + const unsigned int jitNumOptions = 5; + CUjit_option* jitOptions = new CUjit_option[jitNumOptions]; + void** jitOptVals = new void*[jitNumOptions]; + + // set up size of compilation log buffer + jitOptions[0] = CU_JIT_INFO_LOG_BUFFER_SIZE_BYTES; + size_t jitLogBufferSize = 1024 * 1024; + jitOptVals[0] = (void*) jitLogBufferSize; + + // set up pointer to the compilation log buffer + jitOptions[1] = CU_JIT_INFO_LOG_BUFFER; + char* jitLogBuffer = new char[jitLogBufferSize]; + jitOptVals[1] = jitLogBuffer; + + // set up size of compilation error log buffer + jitOptions[2] = CU_JIT_ERROR_LOG_BUFFER_SIZE_BYTES; + size_t jitErrorLogBufferSize = 1024 * 1024; + jitOptVals[2] = (void*) jitErrorLogBufferSize; + + // set up pointer to the compilation error log buffer + jitOptions[3] = CU_JIT_ERROR_LOG_BUFFER; + char* jitErrorLogBuffer = new char[jitErrorLogBufferSize]; + jitOptVals[3] = jitErrorLogBuffer; + + // set the exact CUDA compute target architecture based on the GPU it's going to be actually + // used + jitOptions[4] = CU_JIT_TARGET; + auto target_architecture = get_CUjit_target(device_info.compute_version_major, + device_info.compute_version_minor); + jitOptVals[4] = (void*) target_architecture; + + // load the LLVM module to the CUDA module (CUDA JIT compilation) + auto cuda_jit_ret = cuModuleLoadDataEx( + &cudaModule, ptx_compiled_module.c_str(), jitNumOptions, jitOptions, jitOptVals); + if (!std::string(jitLogBuffer).empty()) 
{ + logger->info("CUDA JIT INFO LOG: {}"_format(std::string(jitLogBuffer))); + } + if (!std::string(jitErrorLogBuffer).empty()) { + logger->info("CUDA JIT ERROR LOG: {}"_format(std::string(jitErrorLogBuffer))); + } + delete[] jitOptions; + delete[] jitOptVals; + delete[] jitLogBuffer; + delete[] jitErrorLogBuffer; + checkCudaErrors(cuda_jit_ret); +} + +} // namespace runner +} // namespace nmodl diff --git a/test/benchmark/cuda_driver.hpp b/test/benchmark/cuda_driver.hpp new file mode 100644 index 0000000000..3fd02fd55e --- /dev/null +++ b/test/benchmark/cuda_driver.hpp @@ -0,0 +1,187 @@ +/************************************************************************* + * Copyright (C) 2018-2022 Blue Brain Project + * + * This file is part of NMODL distributed under the terms of the GNU + * Lesser General Public License. See top-level LICENSE file for details. + *************************************************************************/ + +#pragma once + +/** + * \dir + * \brief Implementation of CUDA and NVVM-based execution engine to run functions from MOD files + * + * \file + * \brief \copybrief nmodl::runner::CUDADriver + */ + +#include +#include + +#include "llvm/IR/Module.h" + +#include "benchmark_info.hpp" +#include "cuda.h" +#include "cuda_runtime.h" +#include "gpu_parameters.hpp" + +using nmodl::cuda_details::GPUExecutionParameters; + +namespace nmodl { +namespace runner { + +struct DeviceInfo { + int count; + std::string name; + int compute_version_major; + int compute_version_minor; +}; + +/** + * @brief Throw meaningful error in case CUDA API call fails + * + * Checks whether a call to the CUDA API was succsful and if not it throws a runntime_error with + * the error message from CUDA. + * + * @param err Return value of the CUDA API call + */ +void checkCudaErrors(CUresult err); + +/** + * \class CUDADriver + * \brief Driver to execute a MOD file function via the CUDA JIT backend. + */ +class CUDADriver { + /// LLVM IR module to execute. + std::unique_ptr module; + CUdevice device; + CUmodule cudaModule; + CUcontext context; + CUfunction function; + CUlinkState linker; + DeviceInfo device_info; + std::string ptx_compiled_module; + + void checkCudaErrors(CUresult err); + void link_libraries(llvm::Module& module, BenchmarkInfo* benchmark_info); + + public: + explicit CUDADriver(std::unique_ptr m) + : module(std::move(m)) {} + + /// Initializes the CUDA GPU JIT driver. + void init(const codegen::Platform& platform, BenchmarkInfo* benchmark_info = nullptr); + + void launch_cuda_kernel(const std::string& entry_point, + const GPUExecutionParameters& gpu_execution_parameters, + void* kernel_parameters) { + // Get kernel function + checkCudaErrors(cuModuleGetFunction(&function, cudaModule, entry_point.c_str())); + + // Kernel launch + checkCudaErrors(cuLaunchKernel(function, + gpu_execution_parameters.gridDimX, + 1, + 1, + gpu_execution_parameters.blockDimX, + 1, + 1, + 0, + nullptr, + &kernel_parameters, + nullptr)); + auto asyncErr = cudaDeviceSynchronize(); + if (asyncErr != cudaSuccess) { + throw std::runtime_error( + fmt::format("CUDA Execution Error: {}\n", cudaGetErrorString(asyncErr))); + } + } + + /// Lookups the entry-point without arguments in the CUDA module and executes it. + void execute_without_arguments(const std::string& entry_point, + const GPUExecutionParameters& gpu_execution_parameters) { + launch_cuda_kernel(entry_point, gpu_execution_parameters, {}); + } + + /// Lookups the entry-point with arguments in the CUDA module and executes it. 
+ template + void execute_with_arguments(const std::string& entry_point, + ArgType arg, + const GPUExecutionParameters& gpu_execution_parameters) { + launch_cuda_kernel(entry_point, gpu_execution_parameters, {&arg}); + } +}; + +/** + * \class BaseGPURunner + * \brief A base runner class that provides functionality to execute an + * entry point in the CUDA module. + */ +class BaseGPURunner { + protected: + std::unique_ptr driver; + + explicit BaseGPURunner(std::unique_ptr m) + : driver(std::make_unique(std::move(m))) {} + + public: + /// Sets up the CUDA driver. + virtual void initialize_driver(const codegen::Platform& platform) = 0; + + /// Runs the entry-point function without arguments. + void run_without_arguments(const std::string& entry_point, + const GPUExecutionParameters& gpu_execution_parameters) { + return driver->execute_without_arguments(entry_point, gpu_execution_parameters); + } + + /// Runs the entry-point function with a pointer to the data as an argument. + template + void run_with_argument(const std::string& entry_point, + ArgType arg, + const GPUExecutionParameters& gpu_execution_parameters) { + return driver->template execute_with_arguments(entry_point, arg, gpu_execution_parameters); + } +}; + +/** + * \class TestGPURunner + * \brief A simple runner for testing purposes. + */ +class TestGPURunner: public BaseGPURunner { + public: + explicit TestGPURunner(std::unique_ptr m) + : BaseGPURunner(std::move(m)) {} + + virtual void initialize_driver(const codegen::Platform& platform) { + driver->init(platform); + } +}; + +/** + * \class BenchmarkGPURunner + * \brief A runner with benchmarking functionality. It takes user-specified GPU + * features into account, as well as it can link against shared libraries. + */ +class BenchmarkGPURunner: public BaseGPURunner { + private: + /// Benchmarking information passed to JIT driver. + BenchmarkInfo benchmark_info; + + public: + BenchmarkGPURunner(std::unique_ptr m, + std::string filename, + std::string output_dir, + std::vector lib_paths = {}, + int opt_level_ir = 0, + int opt_level_codegen = 0) + : BaseGPURunner(std::move(m)) + , benchmark_info{filename, output_dir, lib_paths, opt_level_ir, opt_level_codegen} {} + + virtual void initialize_driver(const codegen::Platform& platform) { + driver->init(platform, &benchmark_info); + } +}; + + +} // namespace runner +} // namespace nmodl diff --git a/test/benchmark/gpu_parameters.hpp b/test/benchmark/gpu_parameters.hpp new file mode 100644 index 0000000000..5e72edb147 --- /dev/null +++ b/test/benchmark/gpu_parameters.hpp @@ -0,0 +1,27 @@ +/************************************************************************* + * Copyright (C) 2018-2022 Blue Brain Project + * + * This file is part of NMODL distributed under the terms of the GNU + * Lesser General Public License. See top-level LICENSE file for details. 
+ *************************************************************************/ + +#pragma once + +/** + * \dir + * \brief GPU execution parameters struct + * + * \file + * \brief \copybrief nmodl::cuda_details::GPUExecutionParameters + */ + +namespace nmodl { +namespace cuda_details { + +struct GPUExecutionParameters { + int gridDimX; + int blockDimX; +}; + +} // namespace cuda_details +} // namespace nmodl diff --git a/test/benchmark/jit_driver.hpp b/test/benchmark/jit_driver.hpp index ed86684f76..3569c4bd4f 100644 --- a/test/benchmark/jit_driver.hpp +++ b/test/benchmark/jit_driver.hpp @@ -15,6 +15,8 @@ * \brief \copybrief nmodl::runner::JITDriver */ +#include "benchmark_info.hpp" + #include "llvm/ExecutionEngine/JITEventListener.h" #include "llvm/ExecutionEngine/Orc/LLJIT.h" #include "llvm/Support/Host.h" @@ -22,24 +24,6 @@ namespace nmodl { namespace runner { -/// A struct to hold the information for benchmarking. -struct BenchmarkInfo { - /// Object filename to dump. - std::string filename; - - /// Object file output directory. - std::string output_dir; - - /// Shared libraries' paths to link against. - std::vector shared_lib_paths; - - /// Optimisation level for IT. - int opt_level_ir; - - /// Optimisation level for machine code generation. - int opt_level_codegen; -}; - /** * \class JITDriver * \brief Driver to execute a MOD file function via LLVM IR backend. diff --git a/test/benchmark/kernels/expsyn.mod b/test/benchmark/kernels/expsyn.mod new file mode 100644 index 0000000000..56ddde3b19 --- /dev/null +++ b/test/benchmark/kernels/expsyn.mod @@ -0,0 +1,42 @@ +NEURON { + POINT_PROCESS ExpSyn + RANGE tau, e, i + NONSPECIFIC_CURRENT i +} + +UNITS { + (nA) = (nanoamp) + (mV) = (millivolt) + (uS) = (microsiemens) +} + +PARAMETER { + tau = 0.1 (ms) <1e-9,1e9> + e = 0 (mV) +} + +ASSIGNED { + v (mV) + i (nA) +} + +STATE { + g (uS) +} + +INITIAL { + g=0 +} + +BREAKPOINT { + SOLVE state METHOD cnexp + i = g*(v - e) +} + +DERIVATIVE state { + g' = -g/tau +} + +NET_RECEIVE(weight (uS)) { + g = g + weight +} diff --git a/test/benchmark/kernels/hh.mod b/test/benchmark/kernels/hh.mod new file mode 100644 index 0000000000..053a15f43f --- /dev/null +++ b/test/benchmark/kernels/hh.mod @@ -0,0 +1,125 @@ +TITLE hh.mod squid sodium, potassium, and leak channels + +COMMENT + This is the original Hodgkin-Huxley treatment for the set of sodium, + potassium, and leakage channels found in the squid giant axon membrane. + ("A quantitative description of membrane current and its application + conduction and excitation in nerve" J.Physiol. (Lond.) 117:500-544 (1952).) + Membrane voltage is in absolute mV and has been reversed in polarity + from the original HH convention and shifted to reflect a resting potential + of -65 mV. + Remember to set celsius=6.3 (or whatever) in your HOC file. + See squid.hoc for an example of a simulation using this model. + SW Jaslove 6 March, 1992 +ENDCOMMENT + +UNITS { + (mA) = (milliamp) + (mV) = (millivolt) + (S) = (siemens) +} + +? 
interface +NEURON { + SUFFIX hh + USEION na READ ena WRITE ina + USEION k READ ek WRITE ik + NONSPECIFIC_CURRENT il + RANGE gnabar, gkbar, gl, el, gna, gk + :GLOBAL minf, hinf, ninf, mtau, htau, ntau + RANGE minf, hinf, ninf, mtau, htau, ntau + THREADSAFE : assigned GLOBALs will be per thread +} + +PARAMETER { + gnabar = .12 (S/cm2) <0,1e9> + gkbar = .036 (S/cm2) <0,1e9> + gl = .0003 (S/cm2) <0,1e9> + el = -54.3 (mV) +} + +STATE { + m h n +} + +ASSIGNED { + v (mV) + celsius (degC) + ena (mV) + ek (mV) + + gna (S/cm2) + gk (S/cm2) + ina (mA/cm2) + ik (mA/cm2) + il (mA/cm2) + minf hinf ninf + mtau (ms) htau (ms) ntau (ms) +} + +? currents +BREAKPOINT { + SOLVE states METHOD cnexp + gna = gnabar*m*m*m*h + ina = gna*(v - ena) + gk = gkbar*n*n*n*n + ik = gk*(v - ek) + il = gl*(v - el) +} + + +INITIAL { + rates(v) + m = minf + h = hinf + n = ninf +} + +? states +DERIVATIVE states { + rates(v) + m' = (minf-m)/mtau + h' = (hinf-h)/htau + n' = (ninf-n)/ntau +} + +:LOCAL q10 + + +? rates +PROCEDURE rates(v(mV)) { :Computes rate and other constants at current v. + :Call once from HOC to initialize inf at resting v. + LOCAL alpha, beta, sum, q10 +: TABLE minf, mtau, hinf, htau, ninf, ntau DEPEND celsius FROM -100 TO 100 WITH 200 + +UNITSOFF + q10 = 3^((celsius - 6.3)/10) + :"m" sodium activation system + alpha = .1 * vtrap(-(v+40),10) + beta = 4 * exp(-(v+65)/18) + sum = alpha + beta + mtau = 1/(q10*sum) + minf = alpha/sum + :"h" sodium inactivation system + alpha = .07 * exp(-(v+65)/20) + beta = 1 / (exp(-(v+35)/10) + 1) + sum = alpha + beta + htau = 1/(q10*sum) + hinf = alpha/sum + :"n" potassium activation system + alpha = .01*vtrap(-(v+55),10) + beta = .125*exp(-(v+65)/80) + sum = alpha + beta + ntau = 1/(q10*sum) + ninf = alpha/sum +} + +FUNCTION vtrap(x,y) { :Traps for 0 in denominator of rate eqns. + if (fabs(x/y) < 1e-6) { + vtrap = y*(1 - x/y/2) + }else{ + vtrap = x/(exp(x/y) - 1) + } +} + +UNITSON diff --git a/test/benchmark/llvm_benchmark.cpp b/test/benchmark/llvm_benchmark.cpp index 87d7e34512..010bc2edf3 100644 --- a/test/benchmark/llvm_benchmark.cpp +++ b/test/benchmark/llvm_benchmark.cpp @@ -16,6 +16,9 @@ #include "test/unit/codegen/codegen_data_helper.hpp" +#ifdef NMODL_LLVM_CUDA_BACKEND +#include "test/benchmark/cuda_driver.hpp" +#endif namespace nmodl { namespace benchmark { @@ -45,17 +48,43 @@ BenchmarkResults LLVMBenchmark::run_benchmark() { llvm_visitor.find_kernel_names(kernel_names); // Get feature's string and turn them off depending on the cpu. - std::string cpu_name = cpu == "default" ? llvm::sys::getHostCPUName().str() : cpu; - logger->info("CPU: {}", cpu_name); + std::string backend_name; +#ifdef NMODL_LLVM_CUDA_BACKEND + if (platform.is_CUDA_gpu()) { + backend_name = platform.get_name(); + } else { +#endif + backend_name = platform.get_name() == "default" ? llvm::sys::getHostCPUName().str() + : platform.get_name(); +#ifdef NMODL_LLVM_CUDA_BACKEND + } +#endif + logger->info("Backend: {}", backend_name); std::unique_ptr m = llvm_visitor.get_module(); // Create the benchmark runner and initialize it. 
- std::string filename = "v" + std::to_string(llvm_visitor.get_vector_width()) + "_" + - mod_filename; - runner::BenchmarkRunner runner( - std::move(m), filename, output_dir, cpu_name, shared_libs, opt_level_ir, opt_level_codegen); - runner.initialize_driver(); +#ifdef NMODL_LLVM_CUDA_BACKEND + if (platform.is_CUDA_gpu()) { + std::string filename = "cuda_" + mod_filename; + cuda_runner = std::make_unique( + std::move(m), filename, output_dir, shared_libs, opt_level_ir, opt_level_codegen); + cuda_runner->initialize_driver(platform); + } else { +#endif + std::string filename = "v" + std::to_string(llvm_visitor.get_vector_width()) + "_" + + mod_filename; + cpu_runner = std::make_unique(std::move(m), + filename, + output_dir, + backend_name, + shared_libs, + opt_level_ir, + opt_level_codegen); + cpu_runner->initialize_driver(); +#ifdef NMODL_LLVM_CUDA_BACKEND + } +#endif BenchmarkResults results{}; // Benchmark every kernel. @@ -75,7 +104,17 @@ BenchmarkResults LLVMBenchmark::run_benchmark() { // Record the execution time of the kernel. std::string wrapper_name = "__" + kernel_name + "_wrapper"; auto start = std::chrono::steady_clock::now(); - runner.run_with_argument(kernel_name, instance_data.base_ptr); +#ifdef NMODL_LLVM_CUDA_BACKEND + if (platform.is_CUDA_gpu()) { + cuda_runner->run_with_argument(wrapper_name, + instance_data.base_ptr, + gpu_execution_parameters); + } else { +#endif + cpu_runner->run_with_argument(wrapper_name, instance_data.base_ptr); +#ifdef NMODL_LLVM_CUDA_BACKEND + } +#endif auto end = std::chrono::steady_clock::now(); std::chrono::duration diff = end - start; diff --git a/test/benchmark/llvm_benchmark.hpp b/test/benchmark/llvm_benchmark.hpp index f79cad62e5..f03e9ea52d 100644 --- a/test/benchmark/llvm_benchmark.hpp +++ b/test/benchmark/llvm_benchmark.hpp @@ -13,6 +13,16 @@ #include #include "codegen/llvm/codegen_llvm_visitor.hpp" +#include "gpu_parameters.hpp" +#include "test/benchmark/jit_driver.hpp" +#include "utils/logger.hpp" + +#ifdef NMODL_LLVM_CUDA_BACKEND +#include "test/benchmark/cuda_driver.hpp" +#endif + +using nmodl::codegen::Platform; +using nmodl::cuda_details::GPUExecutionParameters; namespace nmodl { namespace benchmark { @@ -47,8 +57,11 @@ class LLVMBenchmark { /// The size of the instance struct for benchmarking. int instance_size; - /// CPU to target. - std::string cpu; + /// Target platform for the code generation. + Platform platform; + + /// The GPU execution parameters needed to configure the kernels' execution. + GPUExecutionParameters gpu_execution_parameters; /// Optimisation level for IR generation. int opt_level_ir; @@ -59,6 +72,14 @@ class LLVMBenchmark { /// Filestream for dumping logs to the file. 
std::ofstream ofs; + /// CPU benchmark runner + std::unique_ptr cpu_runner; + +#ifdef NMODL_LLVM_CUDA_BACKEND + /// CUDA benchmark runner + std::unique_ptr cuda_runner; +#endif + public: LLVMBenchmark(codegen::CodegenLLVMVisitor& llvm_visitor, const std::string& mod_filename, @@ -66,7 +87,7 @@ class LLVMBenchmark { std::vector shared_libs, int num_experiments, int instance_size, - const std::string& cpu, + const Platform& platform, int opt_level_ir, int opt_level_codegen) : llvm_visitor(llvm_visitor) @@ -75,9 +96,29 @@ class LLVMBenchmark { , shared_libs(shared_libs) , num_experiments(num_experiments) , instance_size(instance_size) - , cpu(cpu) + , platform(platform) , opt_level_ir(opt_level_ir) , opt_level_codegen(opt_level_codegen) {} + LLVMBenchmark(codegen::CodegenLLVMVisitor& llvm_visitor, + const std::string& mod_filename, + const std::string& output_dir, + std::vector shared_libs, + int num_experiments, + int instance_size, + const Platform& platform, + int opt_level_ir, + int opt_level_codegen, + const GPUExecutionParameters& gpu_exec_params) + : llvm_visitor(llvm_visitor) + , mod_filename(mod_filename) + , output_dir(output_dir) + , shared_libs(shared_libs) + , num_experiments(num_experiments) + , instance_size(instance_size) + , platform(platform) + , opt_level_ir(opt_level_ir) + , opt_level_codegen(opt_level_codegen) + , gpu_execution_parameters(gpu_exec_params) {} /// Runs the benchmark. BenchmarkResults run(); diff --git a/test/integration/mod/test_math.mod b/test/integration/mod/test_math.mod new file mode 100644 index 0000000000..6e3174a846 --- /dev/null +++ b/test/integration/mod/test_math.mod @@ -0,0 +1,16 @@ +NEURON { + SUFFIX test + RANGE x, y +} + +ASSIGNED { x y } + +STATE { m } + +BREAKPOINT { + SOLVE states METHOD cnexp +} + +DERIVATIVE states { + m = exp(y) + x ^ 107 + log(x) +} diff --git a/test/unit/CMakeLists.txt b/test/unit/CMakeLists.txt index 107d856d74..818167859c 100644 --- a/test/unit/CMakeLists.txt +++ b/test/unit/CMakeLists.txt @@ -117,7 +117,6 @@ if(NMODL_ENABLE_LLVM) add_executable(test_llvm_runner visitor/main.cpp codegen/codegen_data_helper.cpp codegen/codegen_llvm_execution.cpp) if(NMODL_ENABLE_LLVM_CUDA) - include_directories(${CUDAToolkit_INCLUDE_DIRS}) target_link_libraries(benchmark_data PRIVATE CUDA::cudart) target_link_libraries(testllvm CUDA::cudart) target_link_libraries(test_llvm_runner CUDA::cudart) diff --git a/test/unit/codegen/codegen_llvm_instance_struct.cpp b/test/unit/codegen/codegen_llvm_instance_struct.cpp index 401e0a6c63..9c22fdda78 100644 --- a/test/unit/codegen/codegen_llvm_instance_struct.cpp +++ b/test/unit/codegen/codegen_llvm_instance_struct.cpp @@ -120,11 +120,15 @@ SCENARIO("Instance Struct creation", "[visitor][llvm][instance_struct]") { size_t ion_ena_index_index = 8; size_t voltage_index = 9; size_t node_index_index = 10; - size_t t_index = 11; - size_t dt_index = 12; - size_t celsius_index = 13; - size_t secondorder_index = 14; - size_t node_count_index = 15; + size_t rhs_index = 11; + size_t d_index = 12; + size_t rhs_shadow_index = 13; + size_t d_shadow_index = 14; + size_t t_index = 15; + size_t dt_index = 16; + size_t celsius_index = 17; + size_t secondorder_index = 18; + size_t node_count_index = 19; // Check if the various instance struct fields are properly initialized REQUIRE(compare(instance_data.members[minf_index], generate_dummy_data(minf_index, num_elements))); @@ -155,6 +159,10 @@ SCENARIO("Instance Struct creation", "[visitor][llvm][instance_struct]") { int* ion_ena_index; double* voltage; int* 
node_index; + double* vec_rhs; + double* vec_d; + double* _shadow_rhs; + double* _shadow_d; double t; double dt; double celsius; diff --git a/test/unit/codegen/codegen_llvm_ir.cpp b/test/unit/codegen/codegen_llvm_ir.cpp index b19ff95066..ebef71688e 100644 --- a/test/unit/codegen/codegen_llvm_ir.cpp +++ b/test/unit/codegen/codegen_llvm_ir.cpp @@ -980,8 +980,8 @@ SCENARIO("Scalar state kernel", "[visitor][llvm]") { // Check the struct type with correct attributes and the kernel declaration. std::regex struct_type( "%.*__instance_var__type = type \\{ double\\*, double\\*, double\\*, double\\*, " - "double\\*, double\\*, double\\*, double\\*, double\\*, double\\*, i32\\*, double, " - "double, double, i32, i32, double\\*, double\\*, double\\*, double\\* \\}"); + "double\\*, double\\*, double\\*, double\\*, double\\*, double\\*, i32\\*, " + "double\\*, double\\*, double\\*, double\\*, double, double, double, i32, i32 \\}"); std::regex kernel_declaration( R"(define void @nrn_state_hh\(%.*__instance_var__type\* noalias nocapture readonly .*\) #0)"); REQUIRE(std::regex_search(module_string, m, struct_type)); @@ -1775,7 +1775,7 @@ SCENARIO("GPU kernel body IR generation", "[visitor][llvm][gpu]") { } DERIVATIVE states { - m = exp(y) + x ^ 2 + m = exp(y) + x ^ 2 + log(x) } )"; @@ -1793,12 +1793,18 @@ SCENARIO("GPU kernel body IR generation", "[visitor][llvm][gpu]") { std::regex pow_declaration(R"(declare double @__nv_pow\(double, double\))"); std::regex pow_new_call(R"(call double @__nv_pow\(double %.*, double .*\))"); std::regex pow_old_call(R"(call double @llvm\.pow\.f64\(double %.*, double .*\))"); + std::regex log_declaration(R"(declare double @__nv_log\(double\))"); + std::regex log_new_call(R"(call double @__nv_log\(double %.*\))"); + std::regex log_old_call(R"(call double @llvm\.log\.f64\(double %.*\))"); REQUIRE(std::regex_search(module_string, m, exp_declaration)); REQUIRE(std::regex_search(module_string, m, exp_new_call)); REQUIRE(!std::regex_search(module_string, m, exp_old_call)); REQUIRE(std::regex_search(module_string, m, pow_declaration)); REQUIRE(std::regex_search(module_string, m, pow_new_call)); REQUIRE(!std::regex_search(module_string, m, pow_old_call)); + REQUIRE(std::regex_search(module_string, m, log_declaration)); + REQUIRE(std::regex_search(module_string, m, log_new_call)); + REQUIRE(!std::regex_search(module_string, m, log_old_call)); } } } diff --git a/test/unit/codegen/codegen_llvm_visitor.cpp b/test/unit/codegen/codegen_llvm_visitor.cpp index 1906d0d27c..af9bed5e7c 100644 --- a/test/unit/codegen/codegen_llvm_visitor.cpp +++ b/test/unit/codegen/codegen_llvm_visitor.cpp @@ -171,15 +171,15 @@ SCENARIO("Check instance struct declaration and setup in wrapper", int* __restrict__ ion_dikdv_index; double* __restrict__ voltage; int* __restrict__ node_index; + double* __restrict__ vec_rhs; + double* __restrict__ vec_d; + double* __restrict__ _shadow_rhs; + double* __restrict__ _shadow_d; double t; double dt; double celsius; int secondorder; int node_count; - double* __restrict__ vec_rhs; - double* __restrict__ vec_d; - double* __restrict__ _shadow_rhs; - double* __restrict__ _shadow_d; }; )"; std::string generated_instance_struct_setup = R"( @@ -226,6 +226,10 @@ SCENARIO("Check instance struct declaration and setup in wrapper", inst->ion_dikdv_index = indexes+5*pnodecount; inst->voltage = nt->_actual_v; inst->node_index = ml->nodeindices; + inst->vec_rhs = nt->_actual_rhs; + inst->vec_d = nt->_actual_d; + inst->_shadow_rhs = nt->_shadow_rhs; + inst->_shadow_d = nt->_shadow_d; inst->t 
= nt->t; inst->dt = nt->dt; inst->celsius = celsius;
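The patch also wires GPU execution into PyJIT (`pynmodl.cpp`, `test/benchmark/benchmark.py`). As a rough sketch of how those pieces fit together from Python, assuming the `JitDriver` class is exposed as `nmodl.Jit` (a hypothetical name; the actual binding registration is not shown above), that `hh.mod` is an illustrative input path, and that `CUDA_HOME` points to a CUDA Toolkit installation providing `libdevice`:
```python
# Minimal sketch of the PyJIT GPU benchmark path added by this patch.
# Assumptions: nmodl.Jit is the Python name of the JitDriver binding (hypothetical),
# and CUDA_HOME points to a CUDA Toolkit installation with libdevice.
import os
import nmodl.dsl as nmodl

cfg = nmodl.CodeGenConfig()
cfg.llvm_opt_level_ir = 2
cfg.llvm_math_library = "libdevice"
cfg.llvm_gpu_name = "nvptx64"
cfg.llvm_gpu_target_architecture = "sm_70"
cfg.shared_lib_paths = [os.environ["CUDA_HOME"] + "/nvvm/libdevice/libdevice.10.bc"]

driver = nmodl.NmodlDriver()
with open("hh.mod") as f:          # illustrative MOD file path
    modast = driver.parse_string(f.read())

jit = nmodl.Jit(cfg)               # hypothetical binding name for JitDriver
results = jit.run(modast, "hh",
                  num_experiments=2, instance_size=10000,
                  cuda_grid_dim_x=4096, cuda_block_dim_x=256)
```
The keyword arguments mirror the `run` signature registered in `pynmodl.cpp`, where `cuda_grid_dim_x` and `cuda_block_dim_x` default to 1.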