Skip to content

Commit

Permalink
vortex 64-bit support fix
Browse files Browse the repository at this point in the history
  • Loading branch information
tinebp committed Jun 14, 2024
1 parent 81d0aee commit a957807
Show file tree
Hide file tree
Showing 4 changed files with 46 additions and 33 deletions.
4 changes: 4 additions & 0 deletions lib/CL/devices/vortex/kernel_args.h
Original file line number Diff line number Diff line change
Expand Up @@ -7,3 +7,7 @@ typedef struct {
uint32_t global_offset[3];
uint32_t kernel_id;
} kernel_args_t;

/*
 * Round `offset` up to the next multiple of `alignment`.
 *
 * offset    - byte offset to align.
 * alignment - required alignment; must be a non-zero power of two, because
 *             the result is computed with a bit mask (~(alignment - 1)).
 *
 * Returns `offset` unchanged when it is already a multiple of `alignment`,
 * otherwise the next larger multiple. The arithmetic is 32-bit unsigned, so
 * the result wraps around if `offset + alignment - 1` exceeds UINT32_MAX.
 *
 * Declared `static inline` because this header is included from both C and
 * C++ translation units: a plain `inline` definition in C99 does not emit an
 * external definition, which leads to an undefined reference at link time
 * whenever the compiler decides not to inline the call (e.g. at -O0).
 */
static inline uint32_t alignOffset(uint32_t offset, uint32_t alignment) {
  const uint32_t mask = alignment - 1;
  return (offset + mask) & ~mask;
}
4 changes: 2 additions & 2 deletions lib/CL/devices/vortex/kernel_main.c
Original file line number Diff line number Diff line change
Expand Up @@ -18,8 +18,8 @@ int main(void) {
for (int i = 0, n = kargs->work_dim; i < 3; i++) {
g_global_offset.m[i] = (i < n) ? kargs->global_offset[i] : 0;
}

void* arg = (void*)((uint8_t*)kargs + sizeof(kernel_args_t));
uint32_t aligned_kernel_args_size = alignOffset(sizeof(kernel_args_t), sizeof(size_t));
void* arg = (void*)((uint8_t*)kargs + aligned_kernel_args_size);
vx_kernel_func_cb kernel_func = (vx_kernel_func_cb)__vx_get_kernel_callback(kargs->kernel_id);
return vx_spawn_threads(kargs->work_dim, kargs->num_groups, kargs->local_size, kernel_func, arg);
}
59 changes: 31 additions & 28 deletions lib/CL/devices/vortex/pocl-vortex.c
Original file line number Diff line number Diff line change
Expand Up @@ -179,7 +179,7 @@ pocl_vortex_init (unsigned j, cl_device_id dev, const char* parameters)

dev->llvm_cpu = NULL;
dev->address_bits = is_64bit ? 64 : 32;
dev->llvm_target_triplet = is_64bit ? "riscv64-unknown-unknown" : "riscv32-unknown-unknown";
dev->llvm_target_triplet = is_64bit ? "riscv64-unknown-unknown-elf" : "riscv32-unknown-unknown-elf";
dev->llvm_abi = is_64bit ? "lp64d" : "ilp32f";
dev->llvm_cpu = is_64bit ? "generic-rv64" : "generic-rv32";
dev->kernellib_name = is_64bit ? "kernel-riscv64" : "kernel-riscv32";
Expand Down Expand Up @@ -424,9 +424,9 @@ void pocl_vortex_run (void *data, _cl_command_node *cmd) {
struct pocl_context *pc = &cmd->command.run.pc;
int vx_err;

int num_groups = 1;
int group_size = 1;
for (int i = 0; i < pc->work_dim; ++i) {
uint32_t num_groups = 1;
uint32_t group_size = 1;
for (uint32_t i = 0; i < pc->work_dim; ++i) {
num_groups *= pc->num_groups[i];
group_size *= pc->local_size[i];
}
Expand All @@ -436,37 +436,39 @@ void pocl_vortex_run (void *data, _cl_command_node *cmd) {
assert (data != NULL);
dd = (vortex_device_data_t *)data;

int ptr_size = dd->is_64bit ? 8 : 4;
uint32_t ptr_size = dd->is_64bit ? 8 : 4;

uint32_t aligned_kernel_args_size = alignOffset(sizeof(kernel_args_t), ptr_size);

// calculate kernel arguments buffer size
int local_mem_size = 0;
uint32_t local_mem_size = 0;
size_t abuf_size = 0;

for (int i = 0; i < meta->num_args; ++i) {
struct pocl_argument* al = &(cmd->command.run.arguments[i]);
if (ARG_IS_LOCAL(meta->arg_info[i])) {
local_mem_size += al->size;
abuf_size += 4;
abuf_size = alignOffset(abuf_size + 4, ptr_size);
} else
if ((meta->arg_info[i].type == POCL_ARG_TYPE_POINTER)
|| (meta->arg_info[i].type == POCL_ARG_TYPE_IMAGE)
|| (meta->arg_info[i].type == POCL_ARG_TYPE_SAMPLER)) {
abuf_size += ptr_size;
abuf_size = alignOffset(abuf_size + ptr_size, ptr_size);
} else {
// scalar argument
abuf_size += al->size;
abuf_size = alignOffset(abuf_size + al->size, ptr_size);
}
}

// local buffers
for (int i = 0; i < meta->num_locals; ++i) {
local_mem_size += meta->local_sizes[i];
abuf_size += 4;
abuf_size = alignOffset(abuf_size + 4, ptr_size);
}

// add local size
if (local_mem_size != 0) {
abuf_size += 4;
abuf_size = alignOffset(abuf_size + 4, ptr_size);
}

// check occupancy
Expand All @@ -483,7 +485,7 @@ void pocl_vortex_run (void *data, _cl_command_node *cmd) {
}

// allocate arguments host buffer
size_t kargs_buffer_size = sizeof(kernel_args_t) + abuf_size;
size_t kargs_buffer_size = aligned_kernel_args_size + abuf_size;
uint8_t* const host_kargs_base_ptr = malloc(kargs_buffer_size);
assert(host_kargs_base_ptr);

Expand Down Expand Up @@ -514,30 +516,31 @@ void pocl_vortex_run (void *data, _cl_command_node *cmd) {

// write arguments

uint8_t* host_args_ptr = host_kargs_base_ptr + sizeof(kernel_args_t);
int local_mem_offset = 0;
uint8_t* const host_args_ptr = host_kargs_base_ptr + aligned_kernel_args_size;
uint32_t host_args_offset = 0;
uint32_t local_mem_offset = 0;

for (int i = 0; i < meta->num_args; ++i) {
struct pocl_argument* al = &(cmd->command.run.arguments[i]);
if (ARG_IS_LOCAL(meta->arg_info[i])) {
if (local_mem_offset == 0) {
memcpy(host_args_ptr, &local_mem_size, 4); // local_size
host_args_ptr += 4;
memcpy(host_args_ptr + host_args_offset, &local_mem_size, 4); // local_size
host_args_offset = alignOffset(host_args_offset + 4, ptr_size);
}
memcpy(host_args_ptr, &local_mem_offset, 4); // arg offset
host_args_ptr += 4;
memcpy(host_args_ptr + host_args_offset, &local_mem_offset, 4); // arg offset
host_args_offset = alignOffset(host_args_offset + 4, ptr_size);
local_mem_offset += al->size;
} else
if (meta->arg_info[i].type == POCL_ARG_TYPE_POINTER) {
if (al->value == NULL) {
memset(host_args_ptr, 0, ptr_size); // NULL pointer value
host_args_ptr += ptr_size;
memset(host_args_ptr + host_args_offset, 0, ptr_size); // NULL pointer value
host_args_offset = alignOffset(host_args_offset + ptr_size, ptr_size);
} else {
cl_mem m = (*(cl_mem *)(al->value));
vortex_buffer_data_t* buf_data = (vortex_buffer_data_t *) m->device_ptrs[cmd->device->global_mem_id].mem_ptr;
uint64_t dev_mem_addr = buf_data->buf_address + al->offset;
memcpy(host_args_ptr, &buf_data->buf_address, ptr_size); // pointer value
host_args_ptr += ptr_size;
memcpy(host_args_ptr + host_args_offset, &buf_data->buf_address, ptr_size); // pointer value
host_args_offset = alignOffset(host_args_offset + ptr_size, ptr_size);
}
} else
if (meta->arg_info[i].type == POCL_ARG_TYPE_IMAGE) {
Expand All @@ -547,19 +550,19 @@ void pocl_vortex_run (void *data, _cl_command_node *cmd) {
POCL_ABORT("POCL_VORTEX_RUN\n");
} else {
// scalar argument
memcpy(host_args_ptr, al->value, al->size); // scalar value
host_args_ptr += al->size;
memcpy(host_args_ptr + host_args_offset, al->value, al->size); // scalar value
host_args_offset = alignOffset(host_args_offset + al->size, ptr_size);
}
}

// write local arguments
for (int i = 0; i < meta->num_locals; ++i) {
if (local_mem_offset == 0) {
memcpy(host_args_ptr, &local_mem_size, 4); // local_size
host_args_ptr += 4;
memcpy(host_args_ptr + host_args_offset, &local_mem_size, 4); // local_size
host_args_offset = alignOffset(host_args_offset + 4, ptr_size);
}
memcpy(host_args_ptr, &local_mem_offset, 4); // arg offset
host_args_ptr += 4;
memcpy(host_args_ptr + host_args_offset, &local_mem_offset, 4); // arg offset
host_args_offset = alignOffset(host_args_offset + 4, ptr_size);
local_mem_offset += meta->local_sizes[i];
}

Expand Down
12 changes: 9 additions & 3 deletions lib/CL/devices/vortex/vortex_utils.cc
Original file line number Diff line number Diff line change
Expand Up @@ -46,6 +46,7 @@
#include "pocl_util.h"

#include "LLVMUtils.h"
#include "kernel_args.h"

static int exec(const char* cmd, std::ostream& out) {
char buffer[128];
Expand Down Expand Up @@ -99,6 +100,9 @@ static bool createArgumentsBuffer(llvm::Function *function, llvm::Module *module
auto &Context = module->getContext();
const llvm::DataLayout &DL = module->getDataLayout();

std::string TargetTriple = module->getTargetTriple();
bool is64Bit = TargetTriple.find("riscv64") != std::string::npos;

auto I32Ty = llvm::Type::getInt32Ty(Context);
auto I8Ty = llvm::Type::getInt8Ty(Context);
auto I8PtrTy = I8Ty->getPointerTo();
Expand All @@ -124,6 +128,8 @@ static bool createArgumentsBuffer(llvm::Function *function, llvm::Module *module

auto MDS = llvm::MDNode::get(Context, llvm::MDString::get(Context, "vortex.uniform"));

uint32_t BaseAlignment = is64Bit ? 8 : 4;

for (auto& OldArg : function->args()) {
auto ArgType = OldArg.getType();
auto ArgOffset = llvm::ConstantInt::get(I32Ty, arg_offset);
Expand All @@ -132,8 +138,8 @@ static bool createArgumentsBuffer(llvm::Function *function, llvm::Module *module
if (allocated_local_mem == nullptr) {
// Load __local_size
auto local_size_ptr = Builder.CreateGEP(I8Ty, ArgBuffer, ArgOffset, "__local_size_ptr");
arg_offset += 4;
auto local_size = Builder.CreateLoad(I32Ty, local_size_ptr, "__local_size");
arg_offset = alignOffset(arg_offset + 4, BaseAlignment);
// Call vx_local_alloc(__local_size)
auto function_type = llvm::FunctionType::get(I8PtrTy, {I32Ty}, false);
auto vx_local_alloc_func = module->getOrInsertFunction("vx_local_alloc", function_type);
Expand All @@ -142,13 +148,13 @@ static bool createArgumentsBuffer(llvm::Function *function, llvm::Module *module
// Load argument __offset
auto offset_ptr = Builder.CreateGEP(I8Ty, ArgBuffer, ArgOffset, OldArg.getName() + "_offset_ptr");
auto offset = Builder.CreateLoad(I32Ty, offset_ptr, OldArg.getName() + "_offset");
arg_offset += 4;
arg_offset = alignOffset(arg_offset + 4, BaseAlignment);
// Apply pointer offset
Arg = Builder.CreateGEP(I8PtrTy, allocated_local_mem, offset, OldArg.getName() + "_byte_ptr");
} else {
auto offset_ptr = Builder.CreateGEP(I8Ty, ArgBuffer, ArgOffset, OldArg.getName() + "_offset_ptr");
Arg = Builder.CreateLoad(ArgType, offset_ptr, OldArg.getName() + "_loaded");
arg_offset += DL.getTypeAllocSize(ArgType);
arg_offset = alignOffset(arg_offset + DL.getTypeAllocSize(ArgType), BaseAlignment);
}
auto instr = llvm::cast<llvm::Instruction>(Arg);
assert(instr != nullptr);
Expand Down

0 comments on commit a957807

Please sign in to comment.