Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Hipfort #6

Open
wants to merge 3 commits into
base: main
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
94 changes: 92 additions & 2 deletions 3d/hip/core_hip.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -179,9 +179,99 @@ __global__ void evolve_kernel(double *currdata, double *prevdata, double a, doub
}
}

void evolve(Field& curr, Field& prev, const double a, const double dt)
// Update the temperature values using five-point stencil */
__global__ void evolve_kernel2(double *currdata, double *prevdata, double a, double dt, int nx, int ny, int nz,
double inv_dx2, double inv_dy2, double inv_dz2, int max_threads)
{

//linearize the thread id to have coalesced accesses
int ythr_offs = threadIdx.y * blockDim.x + threadIdx.x;
int zthr_offs = threadIdx.z * blockDim.y * blockDim.x+ ythr_offs;
int xblock_offs = blockIdx.x * blockDim.z * blockDim.y * blockDim.x + zthr_offs;
int yblock_offs = blockIdx.y * blockDim.z * blockDim.y * blockDim.x * gridDim.x + xblock_offs;
int tid = blockIdx.z * blockDim.z * blockDim.y * blockDim.x * gridDim.x * gridDim.y + yblock_offs;


//i only want to run significative computations, so i need to map the tid to the real memory addresses
//this means that i have some "holes" where the halo is mapped.
//to avoid too many uncoalesced accesses, we decide to "mask" the small skip (i.e. the halo on the central dimension)
//but we will avoid to evaluate the big skip (i.e. the halo surface when we change on the third dimension)
int start_offs = (ny + 2) * (nx + 2) + (nx + 2);// 1 face + 1 line are the halos, must not be computated.
int big_skip_size = (ny + 2) + (nx + 2) ;//not completely sure, could be 2* nx+2 or 2*ny+2
int num_big_skips = tid / ((nx+2)*ny); //again not completely sure, could be 2nx or 2ny
int ind = tid + start_offs + num_big_skips * big_skip_size;


//keep load coalesced (i.e. do load also for thread that must not compute and write)
auto tmpdata = prevdata[ind];
auto ip = prevdata[ind + ((nx+2)*(ny+2))];
auto im = prevdata[ind - ((nx+2)*(ny+2))];
auto jp = prevdata[ind + (nx+2)];
auto jm = prevdata[ind - (nx+2)];
auto kp = prevdata[ind + 1];
auto km = prevdata[ind - 1];


//here we really exclude from the computation the "small skips", they are the first and the last on the innermost dimension
//e.g., if nx is 4, we need to exclude tid 0,5,6,11,12,... because there is where the halo is.
if (!((tid % (nx+2)) == 0 || ((tid - (nx+1)) % (nx+2)) == 0) && tid < max_threads)

//these printfs could be useful to understand the mapping of tid to memory idx.
//to use them you need to reduce the total dimension of the problem to something manageable (e.g. 4,4,4)
//and initialize the grid values with understandable numbers (i.e. 100*x+10*y+z)
//in this way you will have a set of print with all the "active" indexes

/*printf ("thread is block %d,%d,%d, is thread %d,%d,%d, tid %d, ind is %d, and value %f \n ",
(int)blockIdx.x,(int)blockIdx.y,(int)blockIdx.z,
(int)threadIdx.x,(int)threadIdx.y,(int)threadIdx.z,
tid,ind,
prevdata[ind]);*/
/*printf("ind is %f and my halo are %f %f %f %f %f %f \n",
prevdata[ind],
prevdata[ind+(nx+2)], //y
prevdata[ind-(nx+2)], //y
prevdata[ind+((nx+2)*(ny+2))], //x //i think the x-z inversion is due to fortran col major
prevdata[ind-((nx+2)*(ny+2))], //x //i think the x-z inversion is due to fortran col major
prevdata[ind+1], //z //i think the x-z inversion is due to fortran col major
prevdata[ind-1] //z //i think the x-z inversion is due to fortran col major
);*/

tmpdata = tmpdata + a * dt * (
( ip - 2.0 * tmpdata + im ) * inv_dz2 + //c++ matrix, row major. "farthest" point is the z dimension
( jp - 2.0 * tmpdata + jm ) * inv_dy2 +
( kp - 2.0 * tmpdata + km ) * inv_dx2
);
//keep store coalesced. note that we used tmpdata local variable only for this reason.
//halo points will load and store it, without updating.
currdata[ind] = tmpdata;


}


void evolve(Field& curr, Field& prev, const double a, const double dt)
{
//note that in the innermost dimension we do nx +2 to insert the "small skips"
//in the set of threads running. ny and nz don't need to be increased.
int max_threads = (curr.nx+2)* (curr.ny) * (curr.nz);
int blocks,grids;

//i'd like to use the optimal occupancy but the API seems to be broken.
//I'm not sure what is happening there but cannot use that
//hipOccupancyMaxPotentialBlockSize(&grids,&blocks,evolve_kernel, 0, max_threads);

//lets assign a block size of 256, this parameter can be played with.
blocks = 256;
grids = (max_threads + blocks -1) / blocks;

auto inv_dx2 = 1.0 / (prev.dx * prev.dx);
auto inv_dy2 = 1.0 / (prev.dy * prev.dy);
auto inv_dz2 = 1.0 / (prev.dz * prev.dz);

hipLaunchKernelGGL((evolve_kernel2), grids,blocks, 0, 0, curr.devdata(), prev.devdata(), a, dt, curr.nx, curr.ny, curr.nz,inv_dx2,inv_dy2,inv_dz2, max_threads );

//old evolve kernel here. just swap the comments.
/*
int nx = curr.nx;
int ny = curr.ny;
int nz = curr.nz;
Expand Down Expand Up @@ -225,7 +315,7 @@ void evolve(Field& curr, Field& prev, const double a, const double dt)
inv_dx2, inv_dy2, inv_dz2);
hipDeviceSynchronize();
CHECK_ERROR_MSG("evolve");

*/
}

void evolve_interior(Field& curr, Field& prev, const double a, const double dt, hipStream_t *streams)
Expand Down
45 changes: 45 additions & 0 deletions 3d/hipfort/Makefile
Original file line number Diff line number Diff line change
@@ -0,0 +1,45 @@
ifeq ($(COMP),)
COMP=cray
endif

export HIPFORT_HOME=./hipfort/hipfortbin
include Makefile.hipfort



EXE=heat_hipfort
OBJS=main.o kernel.o heat_mod.o core.o utilities.o io.o setup.o

all: $(EXE)

hipfort:
@git clone https://github.com/ROCmSoftwarePlatform/hipfort.git


./hipfort/hipfortbin/lib/libhipfort-amdgcn.a: hipfort
@echo "running the command"
@bash install_hipfort.sh $(COMP)

kernel.o: kernel.cpp ./hipfort/hipfortbin/lib/libhipfort-amdgcn.a
heat_mod.o: heat_mod.F90 ./hipfort/hipfortbin/lib/libhipfort-amdgcn.a
core.o: core.F90 heat_mod.o ./hipfort/hipfortbin/lib/libhipfort-amdgcn.a
utilities.o: utilities.F90 heat_mod.o ./hipfort/hipfortbin/lib/libhipfort-amdgcn.a
io.o: io.F90 heat_mod.o ./hipfort/hipfortbin/lib/libhipfort-amdgcn.a
setup.o: setup.F90 heat_mod.o utilities.o io.o ./hipfort/hipfortbin/lib/libhipfort-amdgcn.a
main.o: main.F90 heat_mod.o core.o io.o setup.o utilities.o ./hipfort/hipfortbin/lib/libhipfort-amdgcn.a



$(EXE): $(OBJS) hipfort/hipfortbin/lib/libhipfort-amdgcn.a
$(FC) $(OBJS) -o $@ $(LDFLAGS) $(LIBS) $(LINKOPTS)

%.o: %.cpp
$(CXX) -c $< -o $@

%.o: %.F90
$(FC) -DGPU_MPI -c $< -o $@


.PHONY: clean
clean:
-/bin/rm -f $(EXE) a.out *.o *.mod *.png *~
51 changes: 51 additions & 0 deletions 3d/hipfort/Makefile.hipfort
Original file line number Diff line number Diff line change
@@ -0,0 +1,51 @@

#
# Makefile.hipfort: Include Makefile to set values LINKOPTS, CXX, and FC
# to support compilation with HIPFORT.
#
# This file is meant to be included by other Makefiles. The includer must
# set HIPFORT_HOME to the build or install location of HIPFORT. For example:
#
# HIPFORT_HOME ?= /opt/rocm/hipfort
# include $(HIPFORT_HOME)/bin/Makefile.hipfort
#
# If the caller does not set HIPFORT_ARCHGPU, this Makefile will call
# a self-identification utility called myarchgpu. Please create an issue
# in the hipfort repo if myarchgpu returns unknown.
#
# To avoid self identification set HIPFORT_ARCHGPU. Examples:
# HIPFORT_ARCHGPU = amdgcn-gfx906
# HIPFORT_ARCHGPU = nvptx-sm_70
#

export ROCM_PATH=/opt/rocm
HIPFORT_ARCHGPU ?= $(shell $(HIPFORT_HOME)/libexec/hipfort/myarchgpu)
ARCH = $(firstword $(subst -, ,$(HIPFORT_ARCHGPU)))
#HIPFORT_COMPILER ?= /opt/cray/pe/gcc/11.2.0/bin/gfortran
#HIPFORT_COMPILER ?= /opt/cray/pe/cce/14.0.2/bin/crayftn
HIPFORT_COMPILER ?= ftn
#CUDA_PATH=${CUDA_PATH:-/usr/local/cuda}
#ROCM_PATH=${ROCM_PATH:-/opt/rocm}
DEVICE_LIB_PATH ?= $(ROCM_PATH)/lib
HIP_CLANG_PATH ?= $(ROCM_PATH)/llvm/bin
HIP_PLATFORM=${HIP_PLATFORM:-amd}

MOD_DIR = $(HIPFORT_HOME)/include/hipfort/$(ARCH)
LIB_DIR = $(HIPFORT_HOME)/lib

GPU = $(strip $(subst ., ,$(suffix $(subst -,.,$(HIPFORT_ARCHGPU)))))
ifeq (nvptx,$(findstring nvptx,$(HIPFORT_ARCHGPU)))
UNAMEP = $(shell uname -p)
LINKOPTS = -L$(LIB_DIR) -lhipfort-$(ARCH) -L$(CUDA_PATH)/targets/$(UNAMEP)-linux/lib $(LINKOPTS) -lcudart
HIPCC_ENV = HIP_PLATFORM=nvcc
HIPCC_OPTS = "--gpu-architecture=$(GPU)"
else
LINKOPTS = -L$(LIB_DIR) -lhipfort-$(ARCH) -L$(ROCM_PATH)/lib $(HIPCC_LIBS) -lamdhip64 -Wl,-rpath=$(ROCM_PATH)/lib
HIPCC_ENV = HIP_PLATFORM=$(HIP_PLATFORM) DEVICE_LIB_PATH=$(DEVICE_LIB_PATH) HIP_CLANG_PATH=$(HIP_CLANG_PATH)
HIPCC_OPTS = -fno-gpu-rdc -fPIC --offload-arch=$(GPU)
endif

LINKOPTS += -lstdc++
CXX = $(ROCM_PATH)/bin/hipcc $(HIPCC_OPTS)
FC_OPTS = "-DHIPFORT_ARCH=\"$(ARCH)\""
FC = $(HIPFORT_COMPILER) -Rb -I$(MOD_DIR) $(FC_OPTS)
97 changes: 97 additions & 0 deletions 3d/hipfort/core.F90
Original file line number Diff line number Diff line change
@@ -0,0 +1,97 @@
! Main solver routines for heat equation solver
module core
use heat
use iso_c_binding
use hipfort
use hipfort_check

!binding the evolve function to the c wrapper that includes the cpp/hip kernel.
!note that I'm working with the field_c struct, that is a struct declared in both c and fortran
!that contains the details of the field plus a c_ptr that is pointing to the DEVICE memory.
!kernel details:
! Compute one time step of temperature evolution
! Arguments:
! curr (type(field_c)): current temperature values
! prev (type(field_c)): values from previous time step
! a (real(c_double)): update equation constant
! dt (real(c_double)): time step value

interface
subroutine evolve(curr, prev, a, dt) , bind(c,name="evolve")
use iso_c_binding
import field_c
implicit none

type(field_c), target, intent(inout) :: curr, prev
real(c_double),value :: a, dt

end subroutine evolve
end interface


contains

! Exchange the boundary data between MPI tasks
subroutine exchange(field0, parallel)
use mpi

implicit none

type(field), target, intent(inout) :: field0
type(parallel_data), intent(in) :: parallel

!!could be a cray issue. have to check with other compilers
real(c_double), dimension(:,:,:),allocatable, target :: data !!!!!! THIS MUST BE ALLOCATABLE NOT POINTER FOR MULTIDIMENSIONAL THINGS !!!!!!!

integer :: buf_size

integer :: ierr

call c_f_pointer (field0%inner_field%data , data, shape(field0%data))
!write(*,*) '#@#@#@#@ data pointer shape is ', shape(data), 'old ptr shape is ', shape(field0%data)
buf_size = (field0%inner_field%nx + 2) * (field0%inner_field%ny + 2)

!#ifdef GPU_MPI
!for now let's suppose that it's always device2device
!TODO manage data transfer via host

! Send to left, receive from right
call mpi_sendrecv(data(:, :, 1), buf_size, MPI_DOUBLE_PRECISION, &
& parallel%nleft, 11, &
& data(:, :, field0%inner_field%nz+2), buf_size, MPI_DOUBLE_PRECISION, &
& parallel%nright, 11, &
& MPI_COMM_WORLD, MPI_STATUS_IGNORE, ierr)

! Send to right, receive from left
call mpi_sendrecv(data(:, :, field0%inner_field%nz+1), buf_size, MPI_DOUBLE_PRECISION, &
& parallel%nright, 12, &
& data(:, :, 1), buf_size, MPI_DOUBLE_PRECISION,&
& parallel%nleft, 12, &
& MPI_COMM_WORLD, MPI_STATUS_IGNORE, ierr)
!write(15,*) '########mpi stuff here, data pointer is at ', c_loc(data(1,1,1)), 'doublecheck ', c_loc(data), 'triplecheck', field0%inner_field%data
!write(15,*) '########mpi stuff here, second element is at ', c_loc(data(1,1,2)), 'or ', c_loc(data(2,1,1))
!write(15,*) 'size of data is', size(data), 'address of last thing is ', c_loc(data(6,6,4)), 'and it contains ',data(6,6,4)
!write(15,*) 'first element over data is ',c_loc(data(7,6,4)), 'that contains ',data(7,6,4), ' or ', c_loc(data(6,6,5)),' that contains ', data(6,6,5) !want to be sure they are out of bounds.
!write(15,*) 'first sendrecv data (left send, right recv), send ',c_loc(data(:,:,2)),' recv ', c_loc(data(:, :, field0%inner_field%nz + 2))
!write(15,*) 'second sendrecv data, send ',c_loc(data(:, :, field0%inner_field%nz+1)),' recv ', c_loc(data(:, :, 1))
!write(15,*) 'field host data is at ', c_loc(field0%data), ' and device data is at ', field0%inner_field%data, ' data pointer is at ', c_loc(data)
!write(15,*) 'buf size is ', buf_size
!write(15,*) 'shape of data is ', shape(data)
!call flush (15)
!write(15,*) 'buf_size', buf_size
!call flush (15)
!write(15,*) 'precision', MPI_DOUBLE_PRECISION
!call flush (15)
!write(15,*) 'right proc' ,parallel%nright
!call flush (15)
!write(15,*) 'pointer' ,data(:, :, 1)
!call flush (15)
!write(15,*) 'left', parallel%nleft
!call flush (15)

!write(15,*) 'second done'


end subroutine exchange

end module core
Loading